Mirror of https://github.com/vllm-project/vllm.git (synced 2025-10-20 23:03:52 +08:00)

Compare commits: v0.10.1.1 ... wye-refact (1463 commits)
@@ -5,11 +5,11 @@ import os
 import sys
 import zipfile
 
-# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 400 MiB
-# Note that we have 400 MiB quota, please use it wisely.
-# See https://github.com/pypi/support/issues/3792 .
+# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 450 MiB
+# Note that we have 800 MiB quota, please use it wisely.
+# See https://github.com/pypi/support/issues/6326 .
 # Please also sync the value with the one in Dockerfile.
-VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 400))
+VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 450))
 
 
 def print_top_10_largest_files(zip_file):
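For context, a minimal sketch (an assumption about usage, not the actual CI script) of how a limit like VLLM_MAX_SIZE_MB is typically checked against a built wheel:

import os
import sys

# Same default as the diff above; override with the VLLM_MAX_SIZE_MB env var.
MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 450))


def check_wheel_size(wheel_path: str) -> int:
    # Convert the on-disk size from bytes to MiB and compare to the limit.
    size_mb = os.path.getsize(wheel_path) / (1 << 20)
    if size_mb > MAX_SIZE_MB:
        print(f"{wheel_path}: {size_mb:.1f} MiB exceeds the {MAX_SIZE_MB} MiB limit")
        return 1
    print(f"{wheel_path}: {size_mb:.1f} MiB is within the {MAX_SIZE_MB} MiB limit")
    return 0


if __name__ == "__main__":
    sys.exit(check_wheel_size(sys.argv[1]))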
@@ -8,7 +8,8 @@ template = """<!DOCTYPE html>
 <html>
 <body>
 <h1>Links for vLLM</h1/>
-<a href="../{wheel_html_escaped}">{wheel}</a><br/>
+<a href="../{x86_wheel_html_escaped}">{x86_wheel}</a><br/>
+<a href="../{arm_wheel_html_escaped}">{arm_wheel}</a><br/>
 </body>
 </html>
 """
@@ -21,7 +22,25 @@ filename = os.path.basename(args.wheel)
 
 with open("index.html", "w") as f:
     print(f"Generated index.html for {args.wheel}")
+    # sync the abi tag with .buildkite/scripts/upload-wheels.sh
+    if "x86_64" in filename:
+        x86_wheel = filename
+        arm_wheel = filename.replace("x86_64", "aarch64").replace(
+            "manylinux1", "manylinux2014"
+        )
+    elif "aarch64" in filename:
+        x86_wheel = filename.replace("aarch64", "x86_64").replace(
+            "manylinux2014", "manylinux1"
+        )
+        arm_wheel = filename
+    else:
+        raise ValueError(f"Unsupported wheel: {filename}")
     # cloudfront requires escaping the '+' character
     f.write(
-        template.format(wheel=filename, wheel_html_escaped=filename.replace("+", "%2B"))
+        template.format(
+            x86_wheel=x86_wheel,
+            x86_wheel_html_escaped=x86_wheel.replace("+", "%2B"),
+            arm_wheel=arm_wheel,
+            arm_wheel_html_escaped=arm_wheel.replace("+", "%2B"),
+        )
     )
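A brief usage sketch of the wheel-name mapping and '+' escaping introduced above (the wheel filename here is a hypothetical example, not taken from the diff):

# Hypothetical x86_64 wheel name, used only to illustrate the mapping.
filename = "vllm-0.10.1+cu128-cp38-abi3-manylinux1_x86_64.whl"

x86_wheel = filename
arm_wheel = filename.replace("x86_64", "aarch64").replace("manylinux1", "manylinux2014")

# CloudFront requires escaping '+', so hrefs use %2B while the link text keeps '+'.
print(x86_wheel.replace("+", "%2B"))
print(arm_wheel.replace("+", "%2B"))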
@@ -1,12 +0,0 @@
-# For vllm script, with -t option (tensor parallel size).
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m HandH1998/QQQ-Llama-3-8b-g128 -b 32 -l 1000 -f 5 -t 1
-model_name: "HandH1998/QQQ-Llama-3-8b-g128"
-tasks:
-- name: "gsm8k"
-  metrics:
-  - name: "exact_match,strict-match"
-    value: 0.419
-  - name: "exact_match,flexible-extract"
-    value: 0.416
-limit: 1000
-num_fewshot: 5
@@ -3,4 +3,3 @@ Meta-Llama-3-70B-Instruct.yaml
 Mixtral-8x7B-Instruct-v0.1.yaml
 Qwen2-57B-A14-Instruct.yaml
 DeepSeek-V2-Lite-Chat.yaml
-Meta-Llama-3-8B-QQQ.yaml
@@ -2,7 +2,7 @@
 # We can use this script to compute baseline accuracy on GSM for transformers.
 #
 # Make sure you have lm-eval-harness installed:
-# pip install lm-eval==0.4.4
+# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
 
 usage() {
 echo``
@@ -3,7 +3,7 @@
 # We use this for fp8, which HF does not support.
 #
 # Make sure you have lm-eval-harness installed:
-# pip install lm-eval==0.4.4
+# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
 
 usage() {
 echo``
@@ -141,7 +141,7 @@ When run, benchmark script generates results under `benchmark/results` folder, a
 `compare-json-results.py` compares two `benchmark_results.json` files and provides performance ratio e.g. for Output Tput, Median TTFT and Median TPOT.
 If only one benchmark_results.json is passed, `compare-json-results.py` compares different TP and PP configurations in the benchmark_results.json instead.
 
-Here is an example using the script to compare result_a and result_b with Model, Dataset name, input/output lenght, max concurrency and qps.
+Here is an example using the script to compare result_a and result_b with Model, Dataset name, input/output length, max concurrency and qps.
 `python3 compare-json-results.py -f results_a/benchmark_results.json -f results_b/benchmark_results.json`
 
 | | Model | Dataset Name | Input Len | Output Len | # of max concurrency | qps | results_a/benchmark_results.json | results_b/benchmark_results.json | perf_ratio |
@@ -8,7 +8,7 @@ This benchmark aims to:
 
 Latest results: [results link](https://blog.vllm.ai/2024/09/05/perf-update.html), scroll to the end.
 
-Latest reproduction guilde: [github issue link](https://github.com/vllm-project/vllm/issues/8176)
+Latest reproduction guide: [github issue link](https://github.com/vllm-project/vllm/issues/8176)
 
 ## Setup
 
@@ -17,7 +17,7 @@ Latest reproduction guilde: [github issue link](https://github.com/vllm-project/
 - SGLang: `lmsysorg/sglang:v0.3.2-cu121`
 - LMDeploy: `openmmlab/lmdeploy:v0.6.1-cu12`
 - TensorRT-LLM: `nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3`
-- *NOTE: we uses r24.07 as the current implementation only works for this version. We are going to bump this up.*
+- *NOTE: we use r24.07 as the current implementation only works for this version. We are going to bump this up.*
 - Check [nightly-pipeline.yaml](nightly-pipeline.yaml) for the concrete docker images, specs and commands we use for the benchmark.
 - Hardware
 - 8x Nvidia A100 GPUs
@@ -3,44 +3,129 @@
 import argparse
 import json
 import os
+from importlib import util
 
 import pandas as pd
 
+plotly_found = util.find_spec("plotly.express") is not None
+
 
 def compare_data_columns(
     files, name_column, data_column, info_cols, drop_column, debug=False
 ):
-    print("\ncompare_data_column: " + data_column)
+    """
+    Align concatenation by keys derived from info_cols instead of row order.
+    - Pick one canonical key list: subset of info_cols present in ALL files.
+    - For each file: set index to those keys, aggregate duplicates
+    - (mean for metric, first for names).
+    - Concat along axis=1 (indexes align), then reset_index so callers can
+    - group by columns.
+    - If --debug, add a <file_label>_name column per file.
+    """
+    print("\ncompare_data_column:", data_column)
 
     frames = []
     raw_data_cols = []
     compare_frames = []
 
+    # 1) choose a canonical key list from info_cols that exists in ALL files
+    cols_per_file = []
+    for f in files:
+        try:
+            df_tmp = pd.read_json(f, orient="records")
+        except Exception as err:
+            raise ValueError(f"Failed to read {f}") from err
+        cols_per_file.append(set(df_tmp.columns))
+
+    key_cols = [c for c in info_cols if all(c in cset for cset in cols_per_file)]
+    if not key_cols:
+        # soft fallback: use any info_cols present in the first file
+        key_cols = [c for c in info_cols if c in list(cols_per_file[0])]
+    if not key_cols:
+        raise ValueError(
+            "No common key columns found from info_cols across the input files."
+        )
+
+    # 2) build a single "meta" block (keys as columns) once, aligned by the key index
+    meta_added = False
+
     for file in files:
-        data_df = pd.read_json(file)
-        serving_df = data_df.dropna(subset=[drop_column], ignore_index=True)
-        # Show all info columns in the first couple columns
-        if not frames:
-            for col in info_cols:
-                if col not in serving_df.columns:
-                    print(f"Skipping missing column: {col}")
-                    continue
-                frames.append(serving_df[col])
-        # only show test name under debug mode
-        if debug is True:
-            serving_df = serving_df.rename(columns={name_column: file + "_name"})
-            frames.append(serving_df[file + "_name"])
-
-        file = "/".join(file.split("/")[:-1])
-        serving_df = serving_df.rename(columns={data_column: file})
-        frames.append(serving_df[file])
-        raw_data_cols.append(file)
-        compare_frames.append(serving_df[file])
+        df = pd.read_json(file, orient="records")
+
+        # Keep rows that actually have the compared metric (same as original behavior)
+        if drop_column in df.columns:
+            df = df.dropna(subset=[drop_column], ignore_index=True)
+
+        # Stabilize numeric key columns (harmless if missing)
+        for c in (
+            "Input Len",
+            "Output Len",
+            "TP Size",
+            "PP Size",
+            "# of max concurrency.",
+            "qps",
+        ):
+            if c in df.columns:
+                df[c] = pd.to_numeric(df[c], errors="coerce")
+
+        # Ensure all key columns exist
+        for c in key_cols:
+            if c not in df.columns:
+                df[c] = pd.NA
+
+        # Set index = key_cols and aggregate duplicates → unique MultiIndex
+        df_idx = df.set_index(key_cols, drop=False)
+
+        # meta (key columns), unique per key
+        meta = df_idx[key_cols]
+        if not meta.index.is_unique:
+            meta = meta.groupby(level=key_cols, dropna=False).first()
+
+        # metric series for this file, aggregated to one row per key
+        file_label = "/".join(file.split("/")[:-1]) or os.path.basename(file)
+        s = df_idx[data_column]
+        if not s.index.is_unique:
+            s = s.groupby(level=key_cols, dropna=False).mean()
+        s.name = file_label  # column label like original
+
+        # add meta once (from first file) so keys are the leftmost columns
+        if not meta_added:
+            frames.append(meta)
+            meta_added = True
+
+        # (NEW) debug: aligned test-name column per file
+        if debug and name_column in df_idx.columns:
+            name_s = df_idx[name_column]
+            if not name_s.index.is_unique:
+                name_s = name_s.groupby(level=key_cols, dropna=False).first()
+            name_s.name = f"{file_label}_name"
+            frames.append(name_s)
+
+        frames.append(s)
+        raw_data_cols.append(file_label)
+        compare_frames.append(s)
+
+    # Generalize ratio: for any file N>=2, add ratio (fileN / file1)
     if len(compare_frames) >= 2:
-        # Compare numbers among two files
-        ratio_df = compare_frames[1] / compare_frames[0]
-        frames.append(ratio_df)
-        compare_frames.pop(1)
+        base = compare_frames[0]
+        current = compare_frames[-1]
+        ratio = current / base
+        ratio = ratio.mask(base == 0)  # avoid inf when baseline is 0
+        ratio.name = f"Ratio 1 vs {len(compare_frames)}"
+        frames.append(ratio)
+
+    # 4) concat on columns with aligned MultiIndex;
+    # then reset_index to return keys as columns
     concat_df = pd.concat(frames, axis=1)
+    concat_df = concat_df.reset_index(drop=True).reset_index()
+    if "index" in concat_df.columns:
+        concat_df = concat_df.drop(columns=["index"])
+
+    # Ensure key/info columns appear first (in your info_cols order)
+    front = [c for c in info_cols if c in concat_df.columns]
+    rest = [c for c in concat_df.columns if c not in front]
+    concat_df = concat_df[front + rest]
+
     print(raw_data_cols)
     return concat_df, raw_data_cols
@@ -67,6 +152,15 @@ def split_json_by_tp_pp(
 
     df = pd.DataFrame(data)
 
+    # Keep only "serving" tests
+    name_col = next(
+        (c for c in ["Test name", "test_name", "Test Name"] if c in df.columns), None
+    )
+    if name_col:
+        df = df[
+            df[name_col].astype(str).str.contains(r"serving", case=False, na=False)
+        ].copy()
+
     # Handle alias column names
     rename_map = {
         "tp_size": "TP Size",
@@ -124,7 +218,7 @@ if __name__ == "__main__":
         "--xaxis",
         type=str,
         default="# of max concurrency.",
-        help="column name to use as X Axis in comparision graph",
+        help="column name to use as X Axis in comparison graph",
     )
     args = parser.parse_args()
 
@@ -181,7 +275,6 @@ if __name__ == "__main__":
                 f"Expected subset: {filtered_info_cols}, "
                 f"but DataFrame has: {list(output_df.columns)}"
             )
-
         output_df_sorted = output_df.sort_values(by=existing_group_cols)
         output_groups = output_df_sorted.groupby(existing_group_cols, dropna=False)
         for name, group in output_groups:
@@ -189,8 +282,7 @@ if __name__ == "__main__":
            text_file.write(html_msgs_for_data_cols[i])
            text_file.write(html)
 
-            if plot is True:
-                import pandas as pd
+            if plot and plotly_found:
                 import plotly.express as px
 
                 df = group[raw_data_cols]
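Below is a minimal, self-contained sketch (not part of the diff; the column names, values, and file labels are made up for illustration) of the alignment-by-key idea the new compare_data_columns uses: index each result set by shared key columns, aggregate duplicate keys, then concatenate the per-file metric columns so rows line up by key rather than by row order.

import pandas as pd

# Hypothetical benchmark results; only the metric value differs between runs.
key_cols = ["Model", "Input Len", "Output Len"]
results_a = pd.DataFrame(
    {"Model": ["llama8B"], "Input Len": [128], "Output Len": [128], "Output Tput": [100.0]}
)
results_b = pd.DataFrame(
    {"Model": ["llama8B"], "Input Len": [128], "Output Len": [128], "Output Tput": [120.0]}
)

frames = []
for label, df in [("results_a", results_a), ("results_b", results_b)]:
    s = df.set_index(key_cols)["Output Tput"]
    # Collapse duplicate keys the same way the script does (mean per key).
    s = s.groupby(level=key_cols, dropna=False).mean()
    s.name = label
    frames.append(s)

merged = pd.concat(frames, axis=1)  # indexes align on the key columns
# Ratio of the latest file to the baseline, masked where the baseline is zero.
merged["perf_ratio"] = (merged["results_b"] / merged["results_a"]).mask(
    merged["results_a"] == 0
)
print(merged.reset_index())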
@@ -181,18 +181,14 @@ launch_vllm_server() {
 if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then
 echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience."
 model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model')
-server_command="python3 \
-  -m vllm.entrypoints.openai.api_server \
+server_command="vllm serve $model \
   -tp $tp \
-  --model $model \
   --port $port \
   $server_args"
 else
 echo "Key 'fp8' does not exist in common params."
-server_command="python3 \
-  -m vllm.entrypoints.openai.api_server \
+server_command="vllm serve $model \
   -tp $tp \
-  --model $model \
   --port $port \
   $server_args"
 fi
@@ -382,7 +382,7 @@ run_genai_perf_tests() {
 client_command="genai-perf profile \
   -m $model \
   --service-kind openai \
-  --backend vllm \
+  --backend "$backend" \
   --endpoint-type chat \
   --streaming \
   --url localhost:$port \
@@ -365,8 +365,7 @@ run_serving_tests() {
 continue
 fi
 
-server_command="$server_envs python3 \
-  -m vllm.entrypoints.openai.api_server \
+server_command="$server_envs vllm serve \
   $server_args"
 
 # run the server
@@ -1,6 +1,6 @@
 [
     {
-        "test_name": "serving_llama8B_tp1_sharegpt",
+        "test_name": "serving_llama8B_bf16_tp1_sharegpt",
         "qps_list": ["inf"],
         "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
         "server_environment_variables": {
@@ -32,7 +32,7 @@
         }
     },
     {
-        "test_name": "serving_llama8B_tp2_sharegpt",
+        "test_name": "serving_llama8B_bf16_tp2_sharegpt",
         "qps_list": ["inf"],
         "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
         "server_environment_variables": {
@@ -64,7 +64,7 @@
         }
     },
     {
-        "test_name": "serving_llama8B_tp4_sharegpt",
+        "test_name": "serving_llama8B_bf16_tp4_sharegpt",
         "qps_list": ["inf"],
         "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
         "server_environment_variables": {
@@ -96,7 +96,7 @@
         }
     },
     {
-        "test_name": "serving_llama8B_tp1_random_128_128",
+        "test_name": "serving_llama8B_bf16_tp1_random_128_128",
         "qps_list": ["inf"],
         "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
         "server_environment_variables": {
@@ -131,7 +131,7 @@
         }
     },
     {
-        "test_name": "serving_llama8B_tp2_random_128_128",
+        "test_name": "serving_llama8B_bf16_tp2_random_128_128",
         "qps_list": ["inf"],
         "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
         "server_environment_variables": {
@@ -166,7 +166,7 @@
         }
     },
     {
-        "test_name": "serving_llama8B_tp4_random_128_128",
+        "test_name": "serving_llama8B_bf16_tp4_random_128_128",
         "qps_list": ["inf"],
         "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
         "server_environment_variables": {
@@ -198,5 +198,413 @@
             "random-output-len": 128,
             "num_prompts": 1000
         }
+    },
+    {
+        "test_name": "serving_llama8B_int8_tp1_sharegpt",
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+            "VLLM_CPU_SGL_KERNEL": 1,
+            "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+            "tensor_parallel_size": 1,
+            "dtype": "bfloat16",
+            "distributed_executor_backend": "mp",
+            "block_size": 128,
+            "trust_remote_code": "",
+            "disable_log_stats": "",
+            "enforce_eager": "",
+            "max_num_batched_tokens": 2048,
+            "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200
+        }
+    },
+    {
+        "test_name": "serving_llama8B_int8_tp2_sharegpt",
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+            "VLLM_CPU_SGL_KERNEL": 1,
+            "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+            "tensor_parallel_size": 2,
+            "dtype": "bfloat16",
+            "distributed_executor_backend": "mp",
+            "block_size": 128,
+            "trust_remote_code": "",
+            "disable_log_stats": "",
+            "enforce_eager": "",
+            "max_num_batched_tokens": 2048,
+            "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200
+        }
+    },
+    {
+        "test_name": "serving_llama8B_int8_tp4_sharegpt",
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+            "VLLM_CPU_SGL_KERNEL": 1,
+            "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+            "tensor_parallel_size": 4,
+            "dtype": "bfloat16",
+            "distributed_executor_backend": "mp",
+            "block_size": 128,
+            "trust_remote_code": "",
+            "disable_log_stats": "",
+            "enforce_eager": "",
+            "max_num_batched_tokens": 2048,
+            "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200
+        }
+    },
+    {
+        "test_name": "serving_llama8B_int8_tp1_random_128_128",
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+            "VLLM_CPU_SGL_KERNEL": 1,
+            "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+            "tensor_parallel_size": 1,
+            "dtype": "bfloat16",
+            "distributed_executor_backend": "mp",
+            "block_size": 128,
+            "trust_remote_code": "",
+            "enable_chunked_prefill": "",
+            "disable_log_stats": "",
+            "enforce_eager": "",
+            "max_num_batched_tokens": 2048,
+            "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+            "backend": "vllm",
+            "dataset_name": "random",
+            "random-input-len": 128,
+            "random-output-len": 128,
+            "ignore-eos": "",
+            "num_prompts": 1000
+        }
+    },
+    {
+        "test_name": "serving_llama8B_int8_tp2_random_128_128",
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+            "VLLM_CPU_SGL_KERNEL": 1,
+            "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+            "tensor_parallel_size": 2,
+            "dtype": "bfloat16",
+            "distributed_executor_backend": "mp",
+            "block_size": 128,
+            "trust_remote_code": "",
+            "enable_chunked_prefill": "",
+            "disable_log_stats": "",
+            "enforce_eager": "",
+            "max_num_batched_tokens": 2048,
+            "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+            "backend": "vllm",
+            "dataset_name": "random",
+            "random-input-len": 128,
+            "random-output-len": 128,
+            "ignore-eos": "",
+            "num_prompts": 1000
+        }
+    },
+    {
+        "test_name": "serving_llama8B_int8_tp4_random_128_128",
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+            "VLLM_CPU_SGL_KERNEL": 1,
+            "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+            "tensor_parallel_size": 4,
+            "dtype": "bfloat16",
+            "distributed_executor_backend": "mp",
+            "block_size": 128,
+            "trust_remote_code": "",
+            "enable_chunked_prefill": "",
+            "disable_log_stats": "",
+            "enforce_eager": "",
+            "max_num_batched_tokens": 2048,
+            "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+            "backend": "vllm",
+            "dataset_name": "random",
+            "random-input-len": 128,
+            "random-output-len": 128,
+            "ignore-eos": "",
+            "num_prompts": 1000
+        }
+    },
+    {
+        "test_name": "serving_llama8B_int4_tp1_sharegpt",
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+            "VLLM_CPU_SGL_KERNEL": 1,
+            "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+            "quantization": "awq",
+            "tensor_parallel_size": 1,
+            "dtype": "bfloat16",
+            "distributed_executor_backend": "mp",
+            "block_size": 128,
+            "trust_remote_code": "",
+            "disable_log_stats": "",
+            "enforce_eager": "",
+            "max_num_batched_tokens": 2048,
+            "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200
+        }
+    },
+    {
+        "test_name": "serving_llama8B_int4_tp2_sharegpt",
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+            "VLLM_CPU_SGL_KERNEL": 1,
+            "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+            "quantization": "awq",
+            "tensor_parallel_size": 2,
+            "dtype": "bfloat16",
+            "distributed_executor_backend": "mp",
+            "block_size": 128,
+            "trust_remote_code": "",
+            "disable_log_stats": "",
+            "enforce_eager": "",
+            "max_num_batched_tokens": 2048,
+            "max_num_seqs": 256,
+            "load_format": "dummy"
+        },
+        "client_parameters": {
+            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200
+        }
+    },
+    {
+        "test_name": "serving_llama8B_int4_tp4_sharegpt",
+        "qps_list": ["inf"],
+        "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+        "server_environment_variables": {
+            "VLLM_RPC_TIMEOUT": 100000,
+            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+            "VLLM_CPU_SGL_KERNEL": 1,
+            "VLLM_CPU_KVCACHE_SPACE": 40
+        },
+        "server_parameters": {
+            "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+            "quantization": "awq",
+            "tensor_parallel_size": 4,
+            "dtype": "bfloat16",
+            "distributed_executor_backend": "mp",
+            "block_size": 128,
+            "trust_remote_code": "",
+            "disable_log_stats": "",
+            "enforce_eager": "",
+            "max_num_batched_tokens": 2048,
|
"max_num_seqs": 256,
|
||||||
|
"load_format": "dummy"
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||||
|
"backend": "vllm",
|
||||||
|
"dataset_name": "sharegpt",
|
||||||
|
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||||
|
"num_prompts": 200
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_llama8B_int4_tp1_random_128_128",
|
||||||
|
"qps_list": ["inf"],
|
||||||
|
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
||||||
|
"server_environment_variables": {
|
||||||
|
"VLLM_RPC_TIMEOUT": 100000,
|
||||||
|
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||||
|
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||||
|
"VLLM_CPU_SGL_KERNEL": 1,
|
||||||
|
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||||
|
},
|
||||||
|
"server_parameters": {
|
||||||
|
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||||
|
"quantization": "awq",
|
||||||
|
"tensor_parallel_size": 1,
|
||||||
|
"dtype": "bfloat16",
|
||||||
|
"distributed_executor_backend": "mp",
|
||||||
|
"block_size": 128,
|
||||||
|
"trust_remote_code": "",
|
||||||
|
"enable_chunked_prefill": "",
|
||||||
|
"disable_log_stats": "",
|
||||||
|
"enforce_eager": "",
|
||||||
|
"max_num_batched_tokens": 2048,
|
||||||
|
"max_num_seqs": 256,
|
||||||
|
"load_format": "dummy"
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||||
|
"backend": "vllm",
|
||||||
|
"dataset_name": "random",
|
||||||
|
"random-input-len": 128,
|
||||||
|
"random-output-len": 128,
|
||||||
|
"ignore-eos": "",
|
||||||
|
"num_prompts": 1000
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_llama8B_int4_tp2_random_128_128",
|
||||||
|
"qps_list": ["inf"],
|
||||||
|
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
||||||
|
"server_environment_variables": {
|
||||||
|
"VLLM_RPC_TIMEOUT": 100000,
|
||||||
|
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||||
|
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||||
|
"VLLM_CPU_SGL_KERNEL": 1,
|
||||||
|
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||||
|
},
|
||||||
|
"server_parameters": {
|
||||||
|
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||||
|
"quantization": "awq",
|
||||||
|
"tensor_parallel_size": 2,
|
||||||
|
"dtype": "bfloat16",
|
||||||
|
"distributed_executor_backend": "mp",
|
||||||
|
"block_size": 128,
|
||||||
|
"trust_remote_code": "",
|
||||||
|
"enable_chunked_prefill": "",
|
||||||
|
"disable_log_stats": "",
|
||||||
|
"enforce_eager": "",
|
||||||
|
"max_num_batched_tokens": 2048,
|
||||||
|
"max_num_seqs": 256,
|
||||||
|
"load_format": "dummy"
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||||
|
"backend": "vllm",
|
||||||
|
"dataset_name": "random",
|
||||||
|
"random-input-len": 128,
|
||||||
|
"random-output-len": 128,
|
||||||
|
"ignore-eos": "",
|
||||||
|
"num_prompts": 1000
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_llama8B_int4_tp4_random_128_128",
|
||||||
|
"qps_list": ["inf"],
|
||||||
|
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
||||||
|
"server_environment_variables": {
|
||||||
|
"VLLM_RPC_TIMEOUT": 100000,
|
||||||
|
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||||
|
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||||
|
"VLLM_CPU_SGL_KERNEL": 1,
|
||||||
|
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||||
|
},
|
||||||
|
"server_parameters": {
|
||||||
|
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||||
|
"quantization": "awq",
|
||||||
|
"tensor_parallel_size": 4,
|
||||||
|
"dtype": "bfloat16",
|
||||||
|
"distributed_executor_backend": "mp",
|
||||||
|
"block_size": 128,
|
||||||
|
"trust_remote_code": "",
|
||||||
|
"enable_chunked_prefill": "",
|
||||||
|
"disable_log_stats": "",
|
||||||
|
"enforce_eager": "",
|
||||||
|
"max_num_batched_tokens": 2048,
|
||||||
|
"max_num_seqs": 256,
|
||||||
|
"load_format": "dummy"
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
|
||||||
|
"backend": "vllm",
|
||||||
|
"dataset_name": "random",
|
||||||
|
"random-input-len": 128,
|
||||||
|
"random-output-len": 128,
|
||||||
|
"ignore-eos": "",
|
||||||
|
"num_prompts": 1000
|
||||||
|
}
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
|
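The entries above are plain data consumed by the CPU serving benchmark jobs. As a quick, illustrative sketch (not part of the suite; the file name is hypothetical and the field paths are taken from the records above), the configured parallelism and dataset per test can be listed with jq:

#!/bin/bash
# Illustrative only: summarize each benchmark entry from a config shaped like the JSON above.
# "serving-tests-cpu.json" is an assumed local file name, not a path defined by the repo.
jq -r '.[] | "\(.test_name): tp=\(.server_parameters.tensor_parallel_size // 1) pp=\(.server_parameters.pipeline_parallel_size // 1) dataset=\(.client_parameters.dataset_name)"' \
    serving-tests-cpu.json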

@@ -1,6 +1,6 @@
 [
   {
-    "test_name": "serving_llama8B_pp1_sharegpt",
+    "test_name": "serving_llama8B_bf16_pp1_sharegpt",
     "qps_list": ["inf"],
     "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
     "server_environment_variables": {
@@ -32,7 +32,39 @@
     }
   },
   {
-    "test_name": "serving_llama8B_pp3_sharegpt",
+    "test_name": "serving_llama8B_bf16_tp2_sharegpt",
+    "qps_list": ["inf"],
+    "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+    "server_environment_variables": {"VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40},
+    "server_parameters": {"model": "meta-llama/Llama-3.1-8B-Instruct", "tensor_parallel_size": 2, "dtype": "bfloat16", "distributed_executor_backend": "mp", "block_size": 128, "trust_remote_code": "", "disable_log_stats": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, "load_format": "dummy"},
+    "client_parameters": {"model": "meta-llama/Llama-3.1-8B-Instruct", "backend": "vllm", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", "num_prompts": 200}
+  },
+  {
+    "test_name": "serving_llama8B_bf16_pp3_sharegpt",
     "qps_list": ["inf"],
     "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
     "server_environment_variables": {
@@ -64,7 +96,7 @@
     }
   },
   {
-    "test_name": "serving_llama8B_tp2pp3_sharegpt",
+    "test_name": "serving_llama8B_bf16_tp2pp3_sharegpt",
     "qps_list": ["inf"],
     "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
     "server_environment_variables": {
@@ -97,7 +129,7 @@
     }
   },
   {
-    "test_name": "serving_llama8B_pp1_random_128_128",
+    "test_name": "serving_llama8B_bf16_pp1_random_128_128",
     "qps_list": ["inf"],
     "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
     "server_environment_variables": {
@@ -132,7 +164,42 @@
     }
   },
   {
-    "test_name": "serving_llama8B_pp3_random_128_128",
+    "test_name": "serving_llama8B_bf16_tp2_random_128_128",
+    "qps_list": ["inf"],
+    "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+    "server_environment_variables": {"VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40},
+    "server_parameters": {"model": "meta-llama/Llama-3.1-8B-Instruct", "tensor_parallel_size": 2, "dtype": "bfloat16", "distributed_executor_backend": "mp", "block_size": 128, "trust_remote_code": "", "enable_chunked_prefill": "", "disable_log_stats": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, "load_format": "dummy"},
+    "client_parameters": {"model": "meta-llama/Llama-3.1-8B-Instruct", "backend": "vllm", "dataset_name": "random", "random-input-len": 128, "random-output-len": 128, "ignore-eos": "", "num_prompts": 1000}
+  },
+  {
+    "test_name": "serving_llama8B_bf16_pp3_random_128_128",
     "qps_list": ["inf"],
     "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
     "server_environment_variables": {
@@ -167,7 +234,7 @@
     }
   },
   {
-    "test_name": "serving_llama8B_tp2pp3_random_128_128",
+    "test_name": "serving_llama8B_bf16_tp2pp3_random_128_128",
     "qps_list": ["inf"],
     "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
     "server_environment_variables": {
@@ -201,5 +268,553 @@
     "ignore-eos": "",
     "num_prompts": 1000
     }
+  },
+  {
+    "test_name": "serving_llama8B_int8_pp1_sharegpt",
+    "qps_list": ["inf"],
+    "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+    "server_environment_variables": {"VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40},
+    "server_parameters": {"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", "pipeline_parallel_size": 1, "dtype": "bfloat16", "distributed_executor_backend": "mp", "block_size": 128, "trust_remote_code": "", "disable_log_stats": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, "load_format": "dummy"},
+    "client_parameters": {"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", "backend": "vllm", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", "num_prompts": 200}
+  },
+  {
+    "test_name": "serving_llama8B_int8_tp2_sharegpt",
+    "qps_list": ["inf"],
+    "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+    "server_environment_variables": {"VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40},
+    "server_parameters": {"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", "tensor_parallel_size": 2, "dtype": "bfloat16", "distributed_executor_backend": "mp", "block_size": 128, "trust_remote_code": "", "disable_log_stats": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, "load_format": "dummy"},
+    "client_parameters": {"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", "backend": "vllm", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", "num_prompts": 200}
+  },
+  {
+    "test_name": "serving_llama8B_int8_pp3_sharegpt",
+    "qps_list": ["inf"],
+    "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+    "server_environment_variables": {"VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40},
+    "server_parameters": {"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", "pipeline_parallel_size": 3, "dtype": "bfloat16", "distributed_executor_backend": "mp", "block_size": 128, "trust_remote_code": "", "disable_log_stats": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, "load_format": "dummy"},
+    "client_parameters": {"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", "backend": "vllm", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", "num_prompts": 200}
+  },
+  {
+    "test_name": "serving_llama8B_int8_tp2pp3_sharegpt",
+    "qps_list": ["inf"],
+    "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+    "server_environment_variables": {"VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40},
+    "server_parameters": {"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", "tensor_parallel_size": 2, "pipeline_parallel_size": 3, "dtype": "bfloat16", "distributed_executor_backend": "mp", "block_size": 128, "trust_remote_code": "", "disable_log_stats": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, "load_format": "dummy"},
+    "client_parameters": {"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", "backend": "vllm", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", "num_prompts": 200}
+  },
+  {
+    "test_name": "serving_llama8B_int8_pp1_random_128_128",
+    "qps_list": ["inf"],
+    "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+    "server_environment_variables": {"VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40},
+    "server_parameters": {"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", "pipeline_parallel_size": 1, "dtype": "bfloat16", "distributed_executor_backend": "mp", "block_size": 128, "trust_remote_code": "", "enable_chunked_prefill": "", "disable_log_stats": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, "load_format": "dummy"},
+    "client_parameters": {"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", "backend": "vllm", "dataset_name": "random", "random-input-len": 128, "random-output-len": 128, "ignore-eos": "", "num_prompts": 1000}
+  },
+  {
+    "test_name": "serving_llama8B_int8_tp2_random_128_128",
+    "qps_list": ["inf"],
+    "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+    "server_environment_variables": {"VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40},
+    "server_parameters": {"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", "tensor_parallel_size": 2, "dtype": "bfloat16", "distributed_executor_backend": "mp", "block_size": 128, "trust_remote_code": "", "enable_chunked_prefill": "", "disable_log_stats": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, "load_format": "dummy"},
+    "client_parameters": {"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", "backend": "vllm", "dataset_name": "random", "random-input-len": 128, "random-output-len": 128, "ignore-eos": "", "num_prompts": 1000}
+  },
+  {
+    "test_name": "serving_llama8B_int8_pp3_random_128_128",
+    "qps_list": ["inf"],
+    "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+    "server_environment_variables": {"VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40},
+    "server_parameters": {"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", "pipeline_parallel_size": 3, "dtype": "bfloat16", "distributed_executor_backend": "mp", "block_size": 128, "trust_remote_code": "", "enable_chunked_prefill": "", "disable_log_stats": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, "load_format": "dummy"},
+    "client_parameters": {"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", "backend": "vllm", "dataset_name": "random", "random-input-len": 128, "random-output-len": 128, "ignore-eos": "", "num_prompts": 1000}
+  },
+  {
+    "test_name": "serving_llama8B_int8_tp2pp3_random_128_128",
+    "qps_list": ["inf"],
+    "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+    "server_environment_variables": {"VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40},
+    "server_parameters": {"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", "tensor_parallel_size": 2, "pipeline_parallel_size": 3, "dtype": "bfloat16", "distributed_executor_backend": "mp", "block_size": 128, "trust_remote_code": "", "enable_chunked_prefill": "", "disable_log_stats": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, "load_format": "dummy"},
+    "client_parameters": {"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", "backend": "vllm", "dataset_name": "random", "random-input-len": 128, "random-output-len": 128, "ignore-eos": "", "num_prompts": 1000}
+  },
+  {
+    "test_name": "serving_llama8B_int4_pp1_sharegpt",
+    "qps_list": ["inf"],
+    "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+    "server_environment_variables": {"VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40},
+    "server_parameters": {"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", "quantization": "awq", "pipeline_parallel_size": 1, "dtype": "bfloat16", "distributed_executor_backend": "mp", "block_size": 128, "trust_remote_code": "", "disable_log_stats": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, "load_format": "dummy"},
+    "client_parameters": {"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", "backend": "vllm", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", "num_prompts": 200}
+  },
+  {
+    "test_name": "serving_llama8B_int4_tp2_sharegpt",
+    "qps_list": ["inf"],
+    "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+    "server_environment_variables": {"VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40},
+    "server_parameters": {"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", "quantization": "awq", "tensor_parallel_size": 2, "dtype": "bfloat16", "distributed_executor_backend": "mp", "block_size": 128, "trust_remote_code": "", "disable_log_stats": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, "load_format": "dummy"},
+    "client_parameters": {"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", "backend": "vllm", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", "num_prompts": 200}
+  },
+  {
+    "test_name": "serving_llama8B_int4_pp3_sharegpt",
+    "qps_list": ["inf"],
+    "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+    "server_environment_variables": {"VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40},
+    "server_parameters": {"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", "quantization": "awq", "pipeline_parallel_size": 3, "dtype": "bfloat16", "distributed_executor_backend": "mp", "block_size": 128, "trust_remote_code": "", "disable_log_stats": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, "load_format": "dummy"},
+    "client_parameters": {"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", "backend": "vllm", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", "num_prompts": 200}
+  },
+  {
+    "test_name": "serving_llama8B_int4_tp2pp3_sharegpt",
+    "qps_list": ["inf"],
+    "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+    "server_environment_variables": {"VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40},
+    "server_parameters": {"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", "quantization": "awq", "tensor_parallel_size": 2, "pipeline_parallel_size": 3, "dtype": "bfloat16", "distributed_executor_backend": "mp", "block_size": 128, "trust_remote_code": "", "disable_log_stats": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, "load_format": "dummy"},
+    "client_parameters": {"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", "backend": "vllm", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", "num_prompts": 200}
+  },
+  {
+    "test_name": "serving_llama8B_int4_pp1_random_128_128",
+    "qps_list": ["inf"],
+    "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+    "server_environment_variables": {"VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40},
+    "server_parameters": {"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", "quantization": "awq", "pipeline_parallel_size": 1, "dtype": "bfloat16", "distributed_executor_backend": "mp", "block_size": 128, "trust_remote_code": "", "enable_chunked_prefill": "", "disable_log_stats": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, "load_format": "dummy"},
+    "client_parameters": {"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", "backend": "vllm", "dataset_name": "random", "random-input-len": 128, "random-output-len": 128, "ignore-eos": "", "num_prompts": 1000}
+  },
+  {
+    "test_name": "serving_llama8B_int4_tp2_random_128_128",
+    "qps_list": ["inf"],
+    "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+    "server_environment_variables": {"VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40},
+    "server_parameters": {"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", "quantization": "awq", "tensor_parallel_size": 2, "dtype": "bfloat16", "distributed_executor_backend": "mp", "block_size": 128, "trust_remote_code": "", "enable_chunked_prefill": "", "disable_log_stats": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, "load_format": "dummy"},
+    "client_parameters": {"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", "backend": "vllm", "dataset_name": "random", "random-input-len": 128, "random-output-len": 128, "ignore-eos": "", "num_prompts": 1000}
+  },
+  {
+    "test_name": "serving_llama8B_int4_pp3_random_128_128",
+    "qps_list": ["inf"],
+    "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+    "server_environment_variables": {"VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40},
+    "server_parameters": {"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", "quantization": "awq", "pipeline_parallel_size": 3, "dtype": "bfloat16", "distributed_executor_backend": "mp", "block_size": 128, "trust_remote_code": "", "enable_chunked_prefill": "", "disable_log_stats": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, "load_format": "dummy"},
+    "client_parameters": {"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", "backend": "vllm", "dataset_name": "random", "random-input-len": 128, "random-output-len": 128, "ignore-eos": "", "num_prompts": 1000}
+  },
+  {
+    "test_name": "serving_llama8B_int4_tp2pp3_random_128_128",
+    "qps_list": ["inf"],
+    "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+    "server_environment_variables": {"VLLM_RPC_TIMEOUT": 100000, "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, "VLLM_CPU_SGL_KERNEL": 1, "VLLM_CPU_KVCACHE_SPACE": 40},
+    "server_parameters": {"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", "quantization": "awq", "tensor_parallel_size": 2, "pipeline_parallel_size": 3, "dtype": "bfloat16", "distributed_executor_backend": "mp", "block_size": 128, "trust_remote_code": "", "enable_chunked_prefill": "", "disable_log_stats": "", "enforce_eager": "", "max_num_batched_tokens": 2048, "max_num_seqs": 256, "load_format": "dummy"},
+    "client_parameters": {"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", "backend": "vllm", "dataset_name": "random", "random-input-len": 128, "random-output-len": 128, "ignore-eos": "", "num_prompts": 1000}
   }
 ]

@@ -1,21 +1,22 @@
 steps:
-  # aarch64 + CUDA builds
+  # aarch64 + CUDA builds. PyTorch 2.8 aarch64 + CUDA wheel is only available on CUDA 12.9
-  - label: "Build arm64 wheel - CUDA 12.8"
+  - label: "Build arm64 wheel - CUDA 12.9"
-    id: build-wheel-arm64-cuda-12-8
+    depends_on: ~
+    id: build-wheel-arm64-cuda-12-9
     agents:
       queue: arm64_cpu_queue_postmerge
     commands:
      # #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
      # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg VLLM_MAIN_CUDA_VERSION=12.9 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
      - "mkdir artifacts"
      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
      - "bash .buildkite/scripts/upload-wheels.sh"
     env:
       DOCKER_BUILDKIT: "1"
 
-  # x86 + CUDA builds
   - label: "Build wheel - CUDA 12.8"
+    depends_on: ~
     id: build-wheel-cuda-12-8
     agents:
       queue: cpu_queue_postmerge
@@ -28,6 +29,7 @@ steps:
       DOCKER_BUILDKIT: "1"
 
   - label: "Build wheel - CUDA 12.6"
+    depends_on: ~
     id: build-wheel-cuda-12-6
     agents:
       queue: cpu_queue_postmerge
@@ -39,44 +41,61 @@ steps:
     env:
       DOCKER_BUILDKIT: "1"
 
-  # Note(simon): We can always build CUDA 11.8 wheel to ensure the build is working.
-  # However, this block can be uncommented to save some compute hours.
-  # - block: "Build CUDA 11.8 wheel"
-  #   key: block-build-cu118-wheel
+  # x86 + CUDA builds
+  - label: "Build wheel - CUDA 12.9"
+    depends_on: ~
+    id: build-wheel-cuda-12-9
 
-  - label: "Build wheel - CUDA 11.8"
-    # depends_on: block-build-cu118-wheel
-    id: build-wheel-cuda-11-8
     agents:
       queue: cpu_queue_postmerge
     commands:
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
      - "mkdir artifacts"
      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
      - "bash .buildkite/scripts/upload-wheels.sh"
     env:
       DOCKER_BUILDKIT: "1"
 
-  - block: "Build release image"
+  - label: "Build release image (x86)"
     depends_on: ~
-    key: block-release-image-build
+    id: build-release-image-x86
 
-  - label: "Build release image"
-    depends_on: block-release-image-build
-    id: build-release-image
     agents:
       queue: cpu_queue_postmerge
     commands:
      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
+      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
+      # re-tag to default image tag and push, just in case arm64 build fails
+      - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
 
+  # PyTorch 2.8 aarch64 + CUDA wheel is only available on CUDA 12.9
+  - label: "Build release image (arm64)"
+    depends_on: ~
+    id: build-release-image-arm64
+    agents:
+      queue: arm64_cpu_queue_postmerge
+    commands:
+      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
+      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
+
+  # Add job to create multi-arch manifest
+  - label: "Create multi-arch manifest"
+    depends_on:
+      - build-release-image-x86
+      - build-release-image-arm64
+    id: create-multi-arch-manifest
+    agents:
+      queue: cpu_queue_postmerge
+    commands:
+      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+      - "docker manifest create public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 --amend"
+      - "docker manifest push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
 
   - label: "Annotate release workflow"
     depends_on:
-      - build-release-image
+      - create-multi-arch-manifest
       - build-wheel-cuda-12-8
-      - build-wheel-cuda-12-6
-      - build-wheel-cuda-11-8
     id: annotate-release-workflow
     agents:
       queue: cpu_queue_postmerge
@@ -123,18 +142,24 @@ steps:
     env:
       DOCKER_BUILDKIT: "1"
 
-  - block: "Build Neuron release image"
-    key: block-neuron-release-image-build
-    depends_on: ~
+  - label: "Build and publish nightly multi-arch image to DockerHub"
+    depends_on:
+      - create-multi-arch-manifest
+    if: build.env("NIGHTLY") == "1"
 
-  - label: "Build and publish Neuron release image"
-    depends_on: block-neuron-release-image-build
    agents:
-      queue: neuron-postmerge
+      queue: cpu_queue_postmerge
    commands:
      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:latest --progress plain -f docker/Dockerfile.neuron ."
-      - "docker push public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:latest"
-      - "docker push public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version)"
+      - "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
+      - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT vllm/vllm-openai:nightly"
+      - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
+      - "docker push vllm/vllm-openai:nightly"
+      - "docker push vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
+      # Clean up old nightly builds (keep only last 14)
+      - "bash .buildkite/scripts/cleanup-nightly-builds.sh"
+    plugins:
+      - docker-login#v3.0.0:
+          username: vllmbot
+          password-env: DOCKERHUB_TOKEN
    env:
      DOCKER_BUILDKIT: "1"
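After the multi-arch manifest step above has run, a quick way to confirm that the manifest references both architectures is to inspect it (a minimal sketch; the tag is whatever commit the pipeline built, and the expected-output comment is an assumption about a successful run, not pipeline output):

#!/bin/bash
# List the platforms referenced by the pushed multi-arch manifest.
TAG="public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}"
docker manifest inspect "$TAG" | jq -r '.manifests[].platform | "\(.os)/\(.architecture)"'
# Expected (if both per-arch images were pushed): linux/amd64 and linux/arm64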

@@ -14,18 +14,33 @@ buildkite-agent annotate --style 'info' --context 'release-workflow' << EOF
 To download the wheel:
 \`\`\`
 aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
+aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux2014_aarch64.whl .
+
 aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu126/vllm-${RELEASE_VERSION}+cu126-cp38-abi3-manylinux1_x86_64.whl .
-aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu118/vllm-${RELEASE_VERSION}+cu118-cp38-abi3-manylinux1_x86_64.whl .
+aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu129/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl .
 \`\`\`
 
 To download and upload the image:
 
 \`\`\`
-docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}
-docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT} vllm/vllm-openai
-docker tag vllm/vllm-openai vllm/vllm-openai:latest
-docker tag vllm/vllm-openai vllm/vllm-openai:v${RELEASE_VERSION}
-docker push vllm/vllm-openai:latest
-docker push vllm/vllm-openai:v${RELEASE_VERSION}
+docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64
+docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64
+
+docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64 vllm/vllm-openai:x86_64
+docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:latest-x86_64
+docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64
+docker push vllm/vllm-openai:latest-x86_64
+docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64
+
+docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64 vllm/vllm-openai:aarch64
+docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:latest-aarch64
+docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
+docker push vllm/vllm-openai:latest-aarch64
+docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
+
+docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64 --amend
+docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 --amend
+docker manifest push vllm/vllm-openai:latest
+docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}
 \`\`\`
 EOF

.buildkite/scripts/cleanup-nightly-builds.sh (new executable file, 97 lines)
@@ -0,0 +1,97 @@
+#!/bin/bash
+
+set -ex
+
+# Clean up old nightly builds from DockerHub, keeping only the last 14 builds
+# This script uses DockerHub API to list and delete old tags with "nightly-" prefix
+
+# DockerHub API endpoint for vllm/vllm-openai repository
+REPO_API_URL="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags"
+
+# Get DockerHub token from environment
+if [ -z "$DOCKERHUB_TOKEN" ]; then
+    echo "Error: DOCKERHUB_TOKEN environment variable is not set"
+    exit 1
+fi
+
+# Function to get all tags from DockerHub
+get_all_tags() {
+    local page=1
+    local all_tags=""
+
+    while true; do
+        local response=$(curl -s -H "Authorization: Bearer $DOCKERHUB_TOKEN" \
+            "$REPO_API_URL?page=$page&page_size=100")
+
+        # Get both last_updated timestamp and tag name, separated by |
+        local tags=$(echo "$response" | jq -r '.results[] | select(.name | startswith("nightly-")) | "\(.last_updated)|\(.name)"')
+
+        if [ -z "$tags" ]; then
+            break
+        fi
+
+        all_tags="$all_tags$tags"$'\n'
+        page=$((page + 1))
+    done
+
+    # Sort by timestamp (newest first) and extract just the tag names
+    echo "$all_tags" | sort -r | cut -d'|' -f2
+}
+
+delete_tag() {
+    local tag_name="$1"
+    echo "Deleting tag: $tag_name"
+
+    local delete_url="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags/$tag_name"
+    local response=$(curl -s -X DELETE -H "Authorization: Bearer $DOCKERHUB_TOKEN" "$delete_url")
+
+    if echo "$response" | jq -e '.detail' > /dev/null 2>&1; then
+        echo "Warning: Failed to delete tag $tag_name: $(echo "$response" | jq -r '.detail')"
+    else
+        echo "Successfully deleted tag: $tag_name"
+    fi
+}
+
+# Get all nightly- prefixed tags, sorted by last_updated timestamp (newest first)
+echo "Fetching all tags from DockerHub..."
+all_tags=$(get_all_tags)
+
+if [ -z "$all_tags" ]; then
+    echo "No tags found to clean up"
+    exit 0
+fi
+
+# Count total tags
+total_tags=$(echo "$all_tags" | wc -l)
+echo "Found $total_tags tags"
+
+# Keep only the last 14 builds (including the current one)
+tags_to_keep=14
+tags_to_delete=$((total_tags - tags_to_keep))
+
+if [ $tags_to_delete -le 0 ]; then
+    echo "No tags need to be deleted (only $total_tags tags found, keeping $tags_to_keep)"
+    exit 0
+fi
+
+echo "Will delete $tags_to_delete old tags, keeping the newest $tags_to_keep"
+
+# Get tags to delete (skip the first $tags_to_keep tags)
+tags_to_delete_list=$(echo "$all_tags" | tail -n +$((tags_to_keep + 1)))
+
+if [ -z "$tags_to_delete_list" ]; then
+    echo "No tags to delete"
+    exit 0
+fi
+
+# Delete old tags
+echo "Deleting old tags..."
+while IFS= read -r tag; do
+    if [ -n "$tag" ]; then
+        delete_tag "$tag"
+        # Add a small delay to avoid rate limiting
+        sleep 1
+    fi
+done <<< "$tags_to_delete_list"
+
+echo "Cleanup completed successfully"
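As a quick illustration of the tag-selection logic in the script above (a minimal sketch with made-up tag names and timestamps; the `keep` value is arbitrary and not the pipeline's setting of 14):

#!/bin/bash
# Run fake "last_updated|tag" rows through the same sort/cut/tail pipeline used above
# to see which nightly tags would be kept and which would be deleted.
sample_tags=$'2025-09-03T00:00:00Z|nightly-aaa\n2025-09-01T00:00:00Z|nightly-bbb\n2025-09-02T00:00:00Z|nightly-ccc'
sorted=$(echo "$sample_tags" | sort -r | cut -d'|' -f2)   # newest first: aaa, ccc, bbb
keep=2
echo "$sorted" | head -n "$keep"                          # tags that survive
echo "$sorted" | tail -n +$((keep + 1))                   # tags that would be deleted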

@@ -86,10 +86,6 @@ if [[ $commands == *"pytest -v -s models/test_registry.py"* ]]; then
   commands=${commands//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"}
 fi
 
-if [[ $commands == *"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"* ]]; then
-  commands=${commands//"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"/"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2 and not BambaForCausalLM and not Gemma2ForCausalLM and not Grok1ModelForCausalLM and not Zamba2ForCausalLM and not Gemma2Model and not GritLM'"}
-fi
-
 if [[ $commands == *"pytest -v -s compile/test_basic_correctness.py"* ]]; then
   commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s compile/test_basic_correctness.py"}
 fi
@@ -164,16 +160,9 @@ if [[ $commands == *" entrypoints/llm "* ]]; then
     --ignore=entrypoints/llm/test_chat.py \
     --ignore=entrypoints/llm/test_accuracy.py \
     --ignore=entrypoints/llm/test_init.py \
-    --ignore=entrypoints/llm/test_generate_multiple_loras.py \
     --ignore=entrypoints/llm/test_prompt_validation.py "}
 fi
 
-#Obsolete currently
-##ignore certain Entrypoints/llm tests
-#if [[ $commands == *" && pytest -v -s entrypoints/llm/test_guided_generate.py"* ]]; then
-#  commands=${commands//" && pytest -v -s entrypoints/llm/test_guided_generate.py"/" "}
-#fi
-
 # --ignore=entrypoints/openai/test_encoder_decoder.py \
 # --ignore=entrypoints/openai/test_embedding.py \
 # --ignore=entrypoints/openai/test_oot_registration.py
|||||||
@@ -25,8 +25,8 @@ numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .

# Run the image, setting --shm-size=4g for tensor parallel.
-docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
+docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
-docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2
+docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=16 --env VLLM_CPU_CI_ENV=1 -e E2E_OMP_THREADS="$OMP_CORE_RANGE" --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2

function cpu_tests() {
set -e
@@ -46,57 +46,74 @@ function cpu_tests() {
set -e
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"

+# Run kernel tests
+docker exec cpu-test-"$NUMA_NODE" bash -c "
+set -e
+pytest -x -v -s tests/kernels/test_onednn.py"

# Run basic model test
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
# Note: disable until supports V1
-# pytest -v -s tests/kernels/attention/test_cache.py -m cpu_model
+# pytest -x -v -s tests/kernels/attention/test_cache.py -m cpu_model
-# pytest -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
+# pytest -x -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model

-# Note: disable Bart until supports V1
+pytest -x -v -s tests/models/language/generation -m cpu_model
-pytest -v -s tests/models/language/generation -m cpu_model \
+VLLM_CPU_SGL_KERNEL=1 pytest -x -v -s tests/models/language/generation -m cpu_model
---ignore=tests/models/language/generation/test_bart.py
-VLLM_CPU_SGL_KERNEL=1 pytest -v -s tests/models/language/generation -m cpu_model \
---ignore=tests/models/language/generation/test_bart.py

-pytest -v -s tests/models/language/pooling -m cpu_model
+pytest -x -v -s tests/models/language/pooling -m cpu_model
-pytest -v -s tests/models/multimodal/generation \
+pytest -x -v -s tests/models/multimodal/generation \
---ignore=tests/models/multimodal/generation/test_mllama.py \
--ignore=tests/models/multimodal/generation/test_pixtral.py \
-m cpu_model"

# Run compressed-tensor test
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
-pytest -s -v \
+pytest -x -s -v \
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs[False-10-32-neuralmagic/Llama-3.2-1B-quantized.w8a8]"

# Note: disable it until supports V1
# Run AWQ test
# docker exec cpu-test-"$NUMA_NODE" bash -c "
# set -e
-# VLLM_USE_V1=0 pytest -s -v \
+# VLLM_USE_V1=0 pytest -x -s -v \
# tests/quantization/test_ipex_quant.py"

# Run multi-lora tests
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
-pytest -s -v \
+pytest -x -s -v \
tests/lora/test_qwen2vl.py"

-# online serving
+# online serving: tp+pp
docker exec cpu-test-"$NUMA_NODE" bash -c '
set -e
VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -pp=2 &
+server_pid=$!
timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
vllm bench serve \
--backend vllm \
--dataset-name random \
--model meta-llama/Llama-3.2-3B-Instruct \
--num-prompts 20 \
---endpoint /v1/completions'
+--endpoint /v1/completions
+kill -s SIGTERM $server_pid &'

+# online serving: tp+dp
+docker exec cpu-test-"$NUMA_NODE" bash -c '
+set -e
+VLLM_CPU_OMP_THREADS_BIND=$E2E_OMP_THREADS VLLM_CPU_SGL_KERNEL=1 vllm serve meta-llama/Llama-3.2-3B-Instruct -tp=2 -dp=2 &
+server_pid=$!
+timeout 600 bash -c "until curl localhost:8000/v1/models; do sleep 1; done" || exit 1
+vllm bench serve \
+--backend vllm \
+--dataset-name random \
+--model meta-llama/Llama-3.2-3B-Instruct \
+--num-prompts 20 \
+--endpoint /v1/completions
+kill -s SIGTERM $server_pid &'
}

# All of CPU tests are expected to be finished less than 40 mins.
export -f cpu_tests
-timeout 1.5h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
+timeout 2h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
@@ -1,64 +0,0 @@
#!/bin/bash

# This script build the Neuron docker image and run the API server inside the container.
# It serves a sanity check for compilation and basic model usage.
set -e
set -v

image_name="neuron/vllm-ci"
container_name="neuron_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"

HF_CACHE="$(realpath ~)/huggingface"
mkdir -p "${HF_CACHE}"
HF_MOUNT="/root/.cache/huggingface"
HF_TOKEN=$(aws secretsmanager get-secret-value --secret-id "ci/vllm-neuron/hf-token" --region us-west-2 --query 'SecretString' --output text | jq -r .VLLM_NEURON_CI_HF_TOKEN)

NEURON_COMPILE_CACHE_URL="$(realpath ~)/neuron_compile_cache"
mkdir -p "${NEURON_COMPILE_CACHE_URL}"
NEURON_COMPILE_CACHE_MOUNT="/root/.cache/neuron_compile_cache"

# Try building the docker image
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws

# prune old image and containers to save disk space, and only once a day
# by using a timestamp file in tmp.
if [ -f /tmp/neuron-docker-build-timestamp ]; then
last_build=$(cat /tmp/neuron-docker-build-timestamp)
current_time=$(date +%s)
if [ $((current_time - last_build)) -gt 86400 ]; then
# Remove dangling images (those that are not tagged and not used by any container)
docker image prune -f
# Remove unused volumes / force the system prune for old images as well.
docker volume prune -f && docker system prune -f
echo "$current_time" > /tmp/neuron-docker-build-timestamp
fi
else
date "+%s" > /tmp/neuron-docker-build-timestamp
fi

docker build -t "${image_name}" -f docker/Dockerfile.neuron .

# Setup cleanup
remove_docker_container() {
docker image rm -f "${image_name}" || true;
}
trap remove_docker_container EXIT

# Run the image
docker run --rm -it --device=/dev/neuron0 --network bridge \
-v "${HF_CACHE}:${HF_MOUNT}" \
-e "HF_HOME=${HF_MOUNT}" \
-e "HF_TOKEN=${HF_TOKEN}" \
-v "${NEURON_COMPILE_CACHE_URL}:${NEURON_COMPILE_CACHE_MOUNT}" \
-e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \
--name "${container_name}" \
${image_name} \
/bin/bash -c "
set -e; # Exit on first error
python3 /workspace/vllm/examples/offline_inference/neuron.py;
python3 -m pytest /workspace/vllm/tests/neuron/1_core/ -v --capture=tee-sys;
for f in /workspace/vllm/tests/neuron/2_core/*.py; do
echo \"Running test file: \$f\";
python3 -m pytest \$f -v --capture=tee-sys;
done
"
191  .buildkite/scripts/hardware_ci/run-npu-test.sh  Normal file
@@ -0,0 +1,191 @@
#!/bin/bash

# This script build the Ascend NPU docker image and run the offline inference inside the container.
# It serves a sanity check for compilation and basic model usage.
set -ex

# Base ubuntu image with basic ascend development libraries and python installed
VLLM_ASCEND_REPO="https://github.com/vllm-project/vllm-ascend.git"
CONFIG_FILE_REMOTE_PATH="tests/e2e/vllm_interface/vllm_test.cfg"
TEST_RUN_CONFIG_FILE="vllm_test.cfg"
VLLM_ASCEND_TMP_DIR=
# Get the test run configuration file from the vllm-ascend repository
fetch_vllm_test_cfg() {
VLLM_ASCEND_TMP_DIR=$(mktemp -d)
# Ensure that the temporary directory is cleaned up when an exception occurs during configuration file retrieval
cleanup() {
rm -rf "${VLLM_ASCEND_TMP_DIR}"
}
trap cleanup EXIT

GIT_TRACE=1 git clone -v --depth 1 "${VLLM_ASCEND_REPO}" "${VLLM_ASCEND_TMP_DIR}"
if [ ! -f "${VLLM_ASCEND_TMP_DIR}/${CONFIG_FILE_REMOTE_PATH}" ]; then
echo "Error: file '${CONFIG_FILE_REMOTE_PATH}' does not exist in the warehouse" >&2
exit 1
fi

# If the file already exists locally, just overwrite it
cp "${VLLM_ASCEND_TMP_DIR}/${CONFIG_FILE_REMOTE_PATH}" "${TEST_RUN_CONFIG_FILE}"
echo "Copied ${CONFIG_FILE_REMOTE_PATH} to ${TEST_RUN_CONFIG_FILE}"

# Since the trap will be overwritten later, and when it is executed here, the task of cleaning up resources
# when the trap is abnormal has been completed, so the temporary resources are manually deleted here.
rm -rf "${VLLM_ASCEND_TMP_DIR}"
trap - EXIT
}

# Downloads test run configuration file from a remote URL.
# Loads the configuration into the current script environment.
get_config() {
if [ ! -f "${TEST_RUN_CONFIG_FILE}" ]; then
echo "Error: file '${TEST_RUN_CONFIG_FILE}' does not exist in the warehouse" >&2
exit 1
fi
source "${TEST_RUN_CONFIG_FILE}"
echo "Base docker image name that get from configuration: ${BASE_IMAGE_NAME}"
return 0
}

# get test running configuration.
fetch_vllm_test_cfg
get_config
# Check if the function call was successful. If not, exit the script.
if [ $? -ne 0 ]; then
exit 1
fi

image_name="npu/vllm-ci:${BUILDKITE_COMMIT}_${EPOCHSECONDS}"
container_name="npu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"

# BUILDKITE_AGENT_NAME format is {hostname}-{agent_idx}-{npu_card_num}cards
agent_idx=$(echo "${BUILDKITE_AGENT_NAME}" | awk -F'-' '{print $(NF-1)}')
echo "agent_idx: ${agent_idx}"
builder_name="cachebuilder${agent_idx}"
builder_cache_dir="/mnt/docker-cache${agent_idx}"
mkdir -p ${builder_cache_dir}

# Try building the docker image
cat <<EOF | DOCKER_BUILDKIT=1 docker build \
--add-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local:${PYPI_CACHE_HOST} \
--builder ${builder_name} --cache-from type=local,src=${builder_cache_dir} \
--cache-to type=local,dest=${builder_cache_dir},mode=max \
--progress=plain --load -t ${image_name} -f - .
FROM ${BASE_IMAGE_NAME}

# Define environments
ENV DEBIAN_FRONTEND=noninteractive

RUN pip config set global.index-url http://cache-service-vllm.nginx-pypi-cache.svc.cluster.local:${PYPI_CACHE_PORT}/pypi/simple && \
pip config set global.trusted-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local && \
apt-get update -y && \
apt-get install -y python3-pip git vim wget net-tools gcc g++ cmake libnuma-dev && \
rm -rf /var/cache/apt/* && \
rm -rf /var/lib/apt/lists/*

# Install for pytest to make the docker build cache layer always valid
RUN --mount=type=cache,target=/root/.cache/pip \
pip install pytest>=6.0 modelscope

WORKDIR /workspace/vllm

# Install vLLM dependencies in advance. Effect: As long as common.txt remains unchanged, the docker cache layer will be valid.
COPY requirements/common.txt /workspace/vllm/requirements/common.txt
RUN --mount=type=cache,target=/root/.cache/pip \
pip install -r requirements/common.txt

COPY . .

# Install vLLM
RUN --mount=type=cache,target=/root/.cache/pip \
VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip uninstall -y triton

# Install vllm-ascend
WORKDIR /workspace
ARG VLLM_ASCEND_REPO=https://github.com/vllm-project/vllm-ascend.git
ARG VLLM_ASCEND_TAG=main
RUN git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf "https://github.com/" && \
git clone --depth 1 \$VLLM_ASCEND_REPO --branch \$VLLM_ASCEND_TAG /workspace/vllm-ascend

# Install vllm dependencies in advance. Effect: As long as common.txt remains unchanged, the docker cache layer will be valid.
RUN --mount=type=cache,target=/root/.cache/pip \
pip install -r /workspace/vllm-ascend/requirements.txt

RUN --mount=type=cache,target=/root/.cache/pip \
export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \
source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
source /usr/local/Ascend/nnal/atb/set_env.sh && \
export LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
python3 -m pip install -v -e /workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/

ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
ENV VLLM_USE_MODELSCOPE=True

WORKDIR /workspace/vllm-ascend

CMD ["/bin/bash"]

EOF

# Setup cleanup
remove_docker_container() {
docker rm -f "${container_name}" || true;
docker image rm -f "${image_name}" || true;
docker system prune -f || true;
}
trap remove_docker_container EXIT

# Generate corresponding --device args based on BUILDKITE_AGENT_NAME
# Ascend NPU BUILDKITE_AGENT_NAME format is {hostname}-{agent_idx}-{npu_card_num}cards, and agent_idx starts from 1.
# e.g. atlas-a2-001-1-2cards means this is the 1-th agent on atlas-a2-001 host, and it has 2 NPU cards.
# returns --device /dev/davinci0 --device /dev/davinci1
parse_and_gen_devices() {
local input="$1"
local index cards_num
if [[ "$input" =~ ([0-9]+)-([0-9]+)cards$ ]]; then
index="${BASH_REMATCH[1]}"
cards_num="${BASH_REMATCH[2]}"
else
echo "parse error" >&2
return 1
fi

local devices=""
local i=0
while (( i < cards_num )); do
local dev_idx=$(((index - 1)*cards_num + i ))
devices="$devices --device /dev/davinci${dev_idx}"
((i++))
done

# trim leading space
devices="${devices#"${devices%%[![:space:]]*}"}"
# Output devices: assigned to the caller variable
printf '%s' "$devices"
}

devices=$(parse_and_gen_devices "${BUILDKITE_AGENT_NAME}") || exit 1

# Run the image and execute the Out-Of-Tree (OOT) platform interface test case on Ascend NPU hardware.
# This test checks whether the OOT platform interface is functioning properly in conjunction with
# the hardware plugin vllm-ascend.
model_cache_dir=/mnt/modelscope${agent_idx}
mkdir -p ${model_cache_dir}
docker run \
${devices} \
--device /dev/davinci_manager \
--device /dev/devmm_svm \
--device /dev/hisi_hdc \
-v /usr/local/dcmi:/usr/local/dcmi \
-v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
-v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
-v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
-v /etc/ascend_install.info:/etc/ascend_install.info \
-v ${model_cache_dir}:/root/.cache/modelscope \
--entrypoint="" \
--name "${container_name}" \
"${image_name}" \
bash -c '
set -e
pytest -v -s tests/e2e/vllm_interface/
'
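As an illustrative aside (not part of the diff above): a minimal sketch of what the parse_and_gen_devices helper in the new NPU script is expected to produce, assuming a hypothetical agent name atlas-a2-001-2-2cards (the 2nd agent on a host with 2 NPU cards).

# Hypothetical usage; mirrors the regex and index arithmetic of parse_and_gen_devices above.
BUILDKITE_AGENT_NAME="atlas-a2-001-2-2cards"
devices=$(parse_and_gen_devices "${BUILDKITE_AGENT_NAME}")
echo "$devices"
# With index=2 and cards_num=2, dev_idx runs over 2 and 3, so this prints:
# --device /dev/davinci2 --device /dev/davinci3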
@@ -61,8 +61,8 @@ echo "Results will be stored in: $RESULTS_DIR"
echo "--- Installing Python dependencies ---"
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
-&& python3 -m pip install --progress-bar off lm_eval[api]==0.4.4 \
+&& python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
-&& python3 -m pip install --progress-bar off hf-transfer
+&& python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
echo "--- Python dependencies installed ---"
export VLLM_USE_V1=1
export VLLM_XLA_CHECK_RECOMPILATION=1

@@ -61,8 +61,8 @@ echo "Results will be stored in: $RESULTS_DIR"
echo "--- Installing Python dependencies ---"
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
-&& python3 -m pip install --progress-bar off lm_eval[api]==0.4.4 \
+&& python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
-&& python3 -m pip install --progress-bar off hf-transfer
+&& python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
echo "--- Python dependencies installed ---"
export VLLM_USE_V1=1
export VLLM_XLA_CHECK_RECOMPILATION=1
@@ -23,21 +23,27 @@ docker run \
--device /dev/dri \
-v /dev/dri/by-path:/dev/dri/by-path \
--entrypoint="" \
+-e "HF_TOKEN=${HF_TOKEN}" \
+-e "ZE_AFFINITY_MASK=${ZE_AFFINITY_MASK}" \
--name "${container_name}" \
"${image_name}" \
-sh -c '
+bash -c '
-VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
+set -e
-VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
+echo $ZE_AFFINITY_MASK
-VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
+pip install tblib==3.1.0
+python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
+python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -O.cudagraph_mode=NONE
+python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
+python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
+VLLM_ATTENTION_BACKEND=TRITON_ATTN python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
cd tests
pytest -v -s v1/core
pytest -v -s v1/engine
pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py
pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
pytest -v -s v1/structured_output
-pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_eagle.py
+pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py
-pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py
+pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py
+pytest -v -s v1/test_metrics
pytest -v -s v1/test_serial_utils.py
-pytest -v -s v1/test_utils.py
-pytest -v -s v1/test_metrics_reader.py
'
@@ -18,7 +18,7 @@ vllm bench throughput --input-len 256 --output-len 256 --output-json throughput_
bench_throughput_exit_code=$?

# run server-based benchmarks and upload the result to buildkite
-python3 -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-2-7b-chat-hf &
+vllm serve meta-llama/Llama-2-7b-chat-hf &
server_pid=$!
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
59  .buildkite/scripts/run-prime-rl-test.sh  Executable file
@@ -0,0 +1,59 @@
#!/bin/bash
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

# Setup script for Prime-RL integration tests
# This script prepares the environment for running Prime-RL tests with nightly vLLM

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"
PRIME_RL_REPO="https://github.com/PrimeIntellect-ai/prime-rl.git"
PRIME_RL_DIR="${REPO_ROOT}/prime-rl"

echo "Setting up Prime-RL integration test environment..."

# Clean up any existing Prime-RL directory
if [ -d "${PRIME_RL_DIR}" ]; then
echo "Removing existing Prime-RL directory..."
rm -rf "${PRIME_RL_DIR}"
fi

# Install UV if not available
if ! command -v uv &> /dev/null; then
echo "Installing UV package manager..."
curl -LsSf https://astral.sh/uv/install.sh | sh
source $HOME/.local/bin/env
fi

# Clone Prime-RL repository at specific branch for reproducible tests
PRIME_RL_BRANCH="integ-vllm-main"
echo "Cloning Prime-RL repository at branch: ${PRIME_RL_BRANCH}..."
git clone --branch "${PRIME_RL_BRANCH}" --single-branch "${PRIME_RL_REPO}" "${PRIME_RL_DIR}"
cd "${PRIME_RL_DIR}"

echo "Setting up UV project environment..."
export UV_PROJECT_ENVIRONMENT=/usr/local
ln -s /usr/bin/python3 /usr/local/bin/python

# Remove vllm pin from pyproject.toml
echo "Removing vllm pin from pyproject.toml..."
sed -i '/vllm==/d' pyproject.toml

# Sync Prime-RL dependencies
echo "Installing Prime-RL dependencies..."
uv sync --inexact && uv sync --inexact --all-extras

# Verify installation
echo "Verifying installations..."
uv run python -c "import vllm; print(f'vLLM version: {vllm.__version__}')"
uv run python -c "import prime_rl; print('Prime-RL imported successfully')"

echo "Prime-RL integration test environment setup complete!"

echo "Running Prime-RL integration tests..."
export WANDB_MODE=offline # this makes this test not require a WANDB_API_KEY
uv run pytest -vs tests/integration/test_rl.py -m gpu

echo "Prime-RL integration tests completed!"
@@ -17,7 +17,7 @@ if [ "$disk_usage" -gt "$threshold" ]; then
# Remove dangling images (those that are not tagged and not used by any container)
docker image prune -f
# Remove unused volumes / force the system prune for old images as well.
-docker volume prune -f && docker system prune --force --filter "until=72h" --all
+docker volume prune -f && docker system prune --force --filter "until=24h" --all
echo "Docker images and volumes cleanup completed."
else
echo "Disk usage is below $threshold%. No cleanup needed."
@@ -14,8 +14,19 @@ fi
# Get the single wheel file
wheel="${wheel_files[0]}"

-# Rename 'linux' to 'manylinux1' in the wheel filename
+# Detect architecture and rename 'linux' to appropriate manylinux version
-new_wheel="${wheel/linux/manylinux1}"
+arch=$(uname -m)
+if [[ $arch == "x86_64" ]]; then
+manylinux_version="manylinux1"
+elif [[ $arch == "aarch64" ]]; then
+manylinux_version="manylinux2014"
+else
+echo "Warning: Unknown architecture $arch, using manylinux1 as default"
+manylinux_version="manylinux1"
+fi

+# Rename 'linux' to the appropriate manylinux version in the wheel filename
+new_wheel="${wheel/linux/$manylinux_version}"
mv -- "$wheel" "$new_wheel"
wheel="$new_wheel"
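For illustration only (not part of the diff): how the renaming logic above behaves on a hypothetical wheel filename, assuming an x86_64 build; the version and tags in the filename are made up.

# Hypothetical example of the bash substitution used above.
wheel="vllm-0.10.2-cp38-abi3-linux_x86_64.whl"   # placeholder filename
manylinux_version="manylinux1"                   # what the x86_64 branch selects
new_wheel="${wheel/linux/$manylinux_version}"
echo "$new_wheel"   # vllm-0.10.2-cp38-abi3-manylinux1_x86_64.whl
# On aarch64 the same substitution with manylinux2014 would yield ...-manylinux2014_aarch64.whl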
@@ -47,14 +58,15 @@ python3 .buildkite/generate_index.py --wheel "$normal_wheel"
aws s3 cp "$wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"

-if [[ $normal_wheel == *"cu118"* ]]; then
+if [[ $normal_wheel == *"cu126"* ]]; then
-# if $normal_wheel matches cu118, do not upload the index.html
-echo "Skipping index files for cu118 wheels"
-elif [[ $normal_wheel == *"cu126"* ]]; then
# if $normal_wheel matches cu126, do not upload the index.html
echo "Skipping index files for cu126 wheels"
+elif [[ $normal_wheel == *"cu128"* ]]; then
+# if $normal_wheel matches cu128, do not upload the index.html
+echo "Skipping index files for cu128 wheels"
else
-# only upload index.html for cu128 wheels (default wheels)
+# only upload index.html for cu129 wheels (default wheels) as it
+# is available on both x86 and arm64
aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
fi
@@ -63,14 +75,15 @@ fi
aws s3 cp "$wheel" "s3://vllm-wheels/nightly/"
aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/"

-if [[ $normal_wheel == *"cu118"* ]]; then
+if [[ $normal_wheel == *"cu126"* ]]; then
-# if $normal_wheel matches cu118, do not upload the index.html
-echo "Skipping index files for cu118 wheels"
-elif [[ $normal_wheel == *"cu126"* ]]; then
# if $normal_wheel matches cu126, do not upload the index.html
echo "Skipping index files for cu126 wheels"
+elif [[ $normal_wheel == *"cu128"* ]]; then
+# if $normal_wheel matches cu128, do not upload the index.html
+echo "Skipping index files for cu128 wheels"
else
-# only upload index.html for cu128 wheels (default wheels)
+# only upload index.html for cu129 wheels (default wheels) as it
+# is available on both x86 and arm64
aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html"
fi
File diff suppressed because it is too large
32  .coveragerc  Normal file
@@ -0,0 +1,32 @@
[run]
source = vllm
omit =
    */tests/*
    */test_*
    */__pycache__/*
    */build/*
    */dist/*
    */vllm.egg-info/*
    */third_party/*
    */examples/*
    */benchmarks/*
    */docs/*

[report]
exclude_lines =
    pragma: no cover
    def __repr__
    if self.debug:
    if settings.DEBUG
    raise AssertionError
    raise NotImplementedError
    if 0:
    if __name__ == .__main__.:
    class .*\bProtocol\):
    @(abc\.)?abstractmethod

[html]
directory = htmlcov

[xml]
output = coverage.xml
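A minimal sketch (not taken from the diff) of how this .coveragerc would typically be exercised with the standard coverage.py CLI; the test path is a placeholder.

# Illustrative only; run from the repo root so the [run]/[report] sections above apply.
coverage run -m pytest tests/
coverage report   # terminal summary filtered by the omit/exclude rules above
coverage html     # writes the HTML report to htmlcov/ per the [html] section
coverage xml      # writes coverage.xml per the [xml] section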
24  .github/.bc-linter.yml  vendored  Normal file
@@ -0,0 +1,24 @@
# doc: https://github.com/pytorch/test-infra/blob/main/tools/stronghold/docs/bc_linter_config.md
version: 1
paths:
  # We temporarily disable globally, and will only enable with `annotations.include`
  # include:
  #   - "vllm/v1/attetion/*.py"
  #   - "vllm/v1/core/*.py"
  exclude:
    - "**/*.py"

scan:
  functions: true # check free functions and methods
  classes: true # check classes/dataclasses
  public_only: true # ignore names starting with "_" at any level

annotations:
  include: # decorators that force‑include a symbol
    - name: "bc_linter_include" # matched by simple name or dotted suffix
      propagate_to_members: false # for classes, include methods/inner classes
  exclude: # decorators that force‑exclude a symbol
    - name: "bc_linter_skip" # matched by simple name or dotted suffix
      propagate_to_members: true # for classes, exclude methods/inner classes

excluded_violations: [] # e.g. ["ParameterRenamed", "FieldTypeChanged"]
86  .github/CODEOWNERS  vendored
@@ -2,20 +2,22 @@
# for more info about CODEOWNERS file

# This lists cover the "core" components of vLLM that require careful review
+/vllm/attention @LucasWilkinson
/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
-/vllm/core @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
+/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
-/vllm/engine/llm_engine.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
+/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
-/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
+/vllm/model_executor/layers/fused_moe @mgoin
-/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
+/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @NickLucche
-/vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
-/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256
-/vllm/multimodal @DarkLight1337 @ywang96
+/vllm/model_executor/layers/mamba @tdoublep
+/vllm/model_executor/model_loader @22quinn
+/vllm/multimodal @DarkLight1337 @ywang96 @NickLucche
/vllm/vllm_flash_attn @LucasWilkinson
/vllm/lora @jeejeelee
-/vllm/reasoning @aarnphm
+/vllm/reasoning @aarnphm @chaunceyjiang
-/vllm/entrypoints @aarnphm
+/vllm/entrypoints @aarnphm @chaunceyjiang
/vllm/compilation @zou3519 @youkaichao @ProExpertProg
+/vllm/distributed/kv_transfer @NickLucche @ApostaC
CMakeLists.txt @tlrmchlsmth @LucasWilkinson

# Any change to the VllmConfig changes can have a large user-facing impact,
@@ -24,40 +26,63 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson

# vLLM V1
/vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
-/vllm/v1/structured_output @mgoin @russellb @aarnphm
+/vllm/v1/attention @LucasWilkinson
+/vllm/v1/attention/backends/flashinfer.py @mgoin
+/vllm/v1/attention/backends/triton_attn.py @tdoublep
+/vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat @heheda12345 @ApostaC
+/vllm/v1/sample @22quinn @houseroad @njhill
+/vllm/v1/spec_decode @benchislett @luccafong
+/vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett
+/vllm/v1/kv_cache_interface.py @heheda12345
+/vllm/v1/offloading @ApostaC

# Test ownership
/.buildkite/lm-eval-harness @mgoin @simon-mo
-/tests/async_engine @njhill @robertgshaw2-redhat @simon-mo
-/tests/basic_correctness/test_chunked_prefill @rkooo567 @comaniac
/tests/distributed/test_multi_node_assignment.py @youkaichao
/tests/distributed/test_pipeline_parallel.py @youkaichao
/tests/distributed/test_same_node.py @youkaichao
-/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo @aarnphm
+/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo @aarnphm @NickLucche
-/tests/kernels @tlrmchlsmth @WoosukKwon @yewentao256
+/tests/evals @mgoin
+/tests/kernels @mgoin @tlrmchlsmth @WoosukKwon @yewentao256
/tests/models @DarkLight1337 @ywang96
-/tests/multimodal @DarkLight1337 @ywang96
+/tests/multimodal @DarkLight1337 @ywang96 @NickLucche
-/tests/prefix_caching @comaniac @KuntaiDu
/tests/quantization @mgoin @robertgshaw2-redhat @yewentao256
/tests/test_inputs.py @DarkLight1337 @ywang96
/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
/tests/v1/structured_output @mgoin @russellb @aarnphm
+/tests/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat @heheda12345 @ApostaC
/tests/weight_loading @mgoin @youkaichao @yewentao256
/tests/lora @jeejeelee
+/tests/models/language/generation/test_hybrid.py @tdoublep
+/tests/v1/kv_connector/nixl_integration @NickLucche
+/tests/v1/kv_connector @ApostaC
+/tests/v1/offloading @ApostaC

+# Transformers backend
+/vllm/model_executor/models/transformers.py @hmellor
+/tests/models/test_transformers.py @hmellor

# Docs
-/docs @hmellor
+/docs/mkdocs @hmellor
+/docs/**/*.yml @hmellor
+/requirements/docs.txt @hmellor
+.readthedocs.yaml @hmellor
mkdocs.yaml @hmellor

+# Linting
+.markdownlint.yaml @hmellor
+.pre-commit-config.yaml @hmellor
+/tools/pre_commit @hmellor

# CPU
-/vllm/v1/worker/^cpu @bigPYJ1151
+/vllm/v1/worker/cpu* @bigPYJ1151
/csrc/cpu @bigPYJ1151
/vllm/platforms/cpu.py @bigPYJ1151
/cmake/cpu_extension.cmake @bigPYJ1151
/docker/Dockerfile.cpu @bigPYJ1151

# Intel GPU
-/vllm/v1/worker/^xpu @jikunshang
+/vllm/v1/worker/xpu* @jikunshang
/vllm/platforms/xpu.py @jikunshang
/docker/Dockerfile.xpu @jikunshang

@@ -65,6 +90,9 @@ mkdocs.yaml @hmellor
/vllm/attention/backends/dual_chunk_flash_attn.py @sighingnow
/vllm/model_executor/models/qwen* @sighingnow

+# MTP-specific files
+/vllm/model_executor/models/deepseek_mtp.py @luccafong

# Mistral-specific files
/vllm/model_executor/models/mistral*.py @patrickvonplaten
/vllm/model_executor/models/mixtral*.py @patrickvonplaten
@@ -72,3 +100,23 @@ mkdocs.yaml @hmellor
/vllm/model_executor/models/pixtral*.py @patrickvonplaten
/vllm/transformers_utils/configs/mistral.py @patrickvonplaten
/vllm/transformers_utils/tokenizers/mistral.py @patrickvonplaten

+# Kernels
+/vllm/attention/ops/chunked_prefill_paged_decode.py @tdoublep
+/vllm/attention/ops/triton_unified_attention.py @tdoublep

+# ROCm related: specify owner with write access to notify AMD folks for careful code review
+/docker/Dockerfile.rocm* @gshtras
+/vllm/v1/attention/backends/rocm*.py @gshtras
+/vllm/v1/attention/backends/mla/rocm*.py @gshtras
+/vllm/attention/ops/rocm*.py @gshtras
+/vllm/model_executor/layers/fused_moe/rocm*.py @gshtras

+# TPU
+/vllm/v1/worker/tpu* @NickLucche
+/vllm/platforms/tpu.py @NickLucche
+/vllm/v1/sample/tpu @NickLucche
+/vllm/tests/v1/tpu @NickLucche

+# KVConnector installation files
+/requirements/kv_connectors.txt @NickLucche
4  .github/ISSUE_TEMPLATE/750-RFC.yml  vendored
@@ -43,10 +43,6 @@ body:
Any other things you would like to mention.
validations:
required: false
-- type: markdown
-attributes:
-value: >
-Thanks for contributing 🎉! The vLLM core team hosts a biweekly RFC review session at 9:30AM Pacific Time, while most RFCs can be discussed online, you can optionally sign up for a slot to discuss your RFC online [here](https://docs.google.com/document/d/1CiLVBZeIVfR7_PNAKVSusxpceywkoOOB78qoWqHvSZc/edit).
- type: checkboxes
id: askllm
attributes:
3  .github/PULL_REQUEST_TEMPLATE.md  vendored
@@ -7,8 +7,6 @@ PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS (AT THE BOTT

## Test Result

-## (Optional) Documentation Update

---
<details>
<summary> Essential Elements of an Effective PR Description Checklist </summary>
@@ -17,6 +15,7 @@ PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS (AT THE BOTT
- [ ] The test plan, such as providing test command.
- [ ] The test results, such as pasting the results comparison before and after, or e2e results
- [ ] (Optional) The necessary documentation update, such as updating `supported_models.md` and `examples` for a new model.
+- [ ] (Optional) Release notes update. If your change is user facing, please update the release notes draft in the [Google Doc](https://docs.google.com/document/d/1YyVqrgX4gHTtrstbq8oWUImOyPCKSGnJ7xtTpmXzlRs/edit?tab=t.0).
</details>

**BEFORE SUBMITTING, PLEASE READ <https://docs.vllm.ai/en/latest/contributing>** (anything written below this line will be removed by GitHub Actions)
73
.github/mergify.yml
vendored
73
.github/mergify.yml
vendored
@ -2,6 +2,7 @@ pull_request_rules:
|
|||||||
- name: label-documentation
|
- name: label-documentation
|
||||||
description: Automatically apply documentation label
|
description: Automatically apply documentation label
|
||||||
conditions:
|
conditions:
|
||||||
|
- label != stale
|
||||||
- or:
|
- or:
|
||||||
- files~=^[^/]+\.md$
|
- files~=^[^/]+\.md$
|
||||||
- files~=^docs/
|
- files~=^docs/
|
||||||
@ -14,6 +15,7 @@ pull_request_rules:
|
|||||||
- name: label-ci-build
|
- name: label-ci-build
|
||||||
description: Automatically apply ci/build label
|
description: Automatically apply ci/build label
|
||||||
conditions:
|
conditions:
|
||||||
|
- label != stale
|
||||||
- or:
|
- or:
|
||||||
- files~=^\.github/
|
- files~=^\.github/
|
||||||
- files~=\.buildkite/
|
- files~=\.buildkite/
|
||||||
@ -30,6 +32,7 @@ pull_request_rules:
|
|||||||
- name: label-deepseek
|
- name: label-deepseek
|
||||||
description: Automatically apply deepseek label
|
description: Automatically apply deepseek label
|
||||||
conditions:
|
conditions:
|
||||||
|
- label != stale
|
||||||
- or:
|
- or:
|
||||||
- files~=^examples/.*deepseek.*\.py
|
- files~=^examples/.*deepseek.*\.py
|
||||||
- files~=^tests/.*deepseek.*\.py
|
- files~=^tests/.*deepseek.*\.py
|
||||||
@ -46,6 +49,7 @@ pull_request_rules:
|
|||||||
- name: label-frontend
|
- name: label-frontend
|
||||||
description: Automatically apply frontend label
|
description: Automatically apply frontend label
|
||||||
conditions:
|
conditions:
|
||||||
|
- label != stale
|
||||||
- files~=^vllm/entrypoints/
|
- files~=^vllm/entrypoints/
|
||||||
actions:
|
actions:
|
||||||
label:
|
label:
|
||||||
@ -55,6 +59,7 @@ pull_request_rules:
|
|||||||
- name: label-llama
|
- name: label-llama
|
||||||
description: Automatically apply llama label
|
description: Automatically apply llama label
|
||||||
conditions:
|
conditions:
|
||||||
|
- label != stale
|
||||||
- or:
|
- or:
|
||||||
- files~=^examples/.*llama.*\.py
|
- files~=^examples/.*llama.*\.py
|
||||||
- files~=^tests/.*llama.*\.py
|
- files~=^tests/.*llama.*\.py
|
||||||
@ -70,6 +75,7 @@ pull_request_rules:
|
|||||||
- name: label-multi-modality
|
- name: label-multi-modality
|
||||||
description: Automatically apply multi-modality label
|
description: Automatically apply multi-modality label
|
||||||
conditions:
|
conditions:
|
||||||
|
- label != stale
|
||||||
- or:
|
- or:
|
||||||
- files~=^vllm/multimodal/
|
- files~=^vllm/multimodal/
|
||||||
- files~=^tests/multimodal/
|
- files~=^tests/multimodal/
|
||||||
@ -83,6 +89,7 @@ pull_request_rules:
|
|||||||
- name: label-new-model
|
- name: label-new-model
|
||||||
description: Automatically apply new-model label
|
description: Automatically apply new-model label
|
||||||
conditions:
|
conditions:
|
||||||
|
- label != stale
|
||||||
- and:
|
- and:
|
||||||
- files~=^vllm/model_executor/models/
|
- files~=^vllm/model_executor/models/
|
||||||
- files=vllm/model_executor/models/registry.py
|
- files=vllm/model_executor/models/registry.py
|
||||||
@ -94,6 +101,7 @@ pull_request_rules:
|
|||||||
- name: label-performance
|
- name: label-performance
|
||||||
description: Automatically apply performance label
|
description: Automatically apply performance label
|
||||||
conditions:
|
conditions:
|
||||||
|
- label != stale
|
||||||
- or:
|
- or:
|
||||||
- files~=^benchmarks/
|
- files~=^benchmarks/
|
||||||
- files~=^vllm/benchmarks/
|
- files~=^vllm/benchmarks/
|
||||||
@ -107,6 +115,7 @@ pull_request_rules:
|
|||||||
- name: label-qwen
|
- name: label-qwen
|
||||||
description: Automatically apply qwen label
|
description: Automatically apply qwen label
|
||||||
conditions:
|
conditions:
|
||||||
|
- label != stale
|
||||||
- or:
|
- or:
|
||||||
- files~=^examples/.*qwen.*\.py
|
- files~=^examples/.*qwen.*\.py
|
||||||
- files~=^tests/.*qwen.*\.py
|
- files~=^tests/.*qwen.*\.py
|
||||||
@ -121,12 +130,20 @@ pull_request_rules:
|
|||||||
- name: label-gpt-oss
|
- name: label-gpt-oss
|
||||||
description: Automatically apply gpt-oss label
|
description: Automatically apply gpt-oss label
|
||||||
conditions:
|
conditions:
|
||||||
|
- label != stale
|
||||||
- or:
|
- or:
|
||||||
- files~=^examples/.*gpt[-_]?oss.*\.py
|
- files~=^examples/.*gpt[-_]?oss.*\.py
|
||||||
- files~=^tests/.*gpt[-_]?oss.*\.py
|
- files~=^tests/.*gpt[-_]?oss.*\.py
|
||||||
|
- files~=^tests/entrypoints/openai/test_response_api_with_harmony.py
|
||||||
|
- files~=^tests/entrypoints/test_context.py
|
||||||
- files~=^vllm/model_executor/models/.*gpt[-_]?oss.*\.py
|
- files~=^vllm/model_executor/models/.*gpt[-_]?oss.*\.py
|
||||||
- files~=^vllm/model_executor/layers/.*gpt[-_]?oss.*\.py
|
- files~=^vllm/model_executor/layers/.*gpt[-_]?oss.*\.py
|
||||||
|
- files~=^vllm/entrypoints/harmony_utils.py
|
||||||
|
- files~=^vllm/entrypoints/tool_server.py
|
||||||
|
- files~=^vllm/entrypoints/tool.py
|
||||||
|
- files~=^vllm/entrypoints/context.py
|
||||||
- title~=(?i)gpt[-_]?oss
|
- title~=(?i)gpt[-_]?oss
|
||||||
|
- title~=(?i)harmony
|
||||||
actions:
|
actions:
|
||||||
label:
|
label:
|
||||||
add:
|
add:
|
||||||
@ -135,6 +152,7 @@ pull_request_rules:
|
|||||||
- name: label-rocm
|
- name: label-rocm
|
||||||
description: Automatically apply rocm label
|
description: Automatically apply rocm label
|
||||||
conditions:
|
conditions:
|
||||||
|
- label != stale
|
||||||
- or:
|
- or:
|
||||||
- files~=^csrc/rocm/
|
- files~=^csrc/rocm/
|
||||||
- files~=^docker/Dockerfile.rocm
|
- files~=^docker/Dockerfile.rocm
|
||||||
@ -155,6 +173,7 @@ pull_request_rules:
|
|||||||
- name: label-structured-output
|
- name: label-structured-output
|
||||||
description: Automatically apply structured-output label
|
description: Automatically apply structured-output label
|
||||||
conditions:
|
conditions:
|
||||||
|
- label != stale
|
||||||
- or:
|
- or:
|
||||||
- files~=^benchmarks/structured_schemas/
|
- files~=^benchmarks/structured_schemas/
|
||||||
- files=benchmarks/benchmark_serving_structured_output.py
|
- files=benchmarks/benchmark_serving_structured_output.py
|
||||||
@ -164,7 +183,7 @@ pull_request_rules:
|
|||||||
- files=examples/online_serving/openai_chat_completion_structured_outputs.py
|
- files=examples/online_serving/openai_chat_completion_structured_outputs.py
|
||||||
- files=examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
|
- files=examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
|
||||||
- files~=^tests/v1/structured_output/
|
- files~=^tests/v1/structured_output/
|
||||||
- files=tests/v1/entrypoints/llm/test_guided_generate.py
|
- files=tests/v1/entrypoints/llm/test_struct_output_generate.py
|
||||||
- files~=^vllm/v1/structured_output/
|
- files~=^vllm/v1/structured_output/
|
||||||
actions:
|
actions:
|
||||||
label:
|
label:
|
||||||
@ -174,6 +193,7 @@ pull_request_rules:
|
|||||||
- name: label-speculative-decoding
|
- name: label-speculative-decoding
|
||||||
description: Automatically apply speculative-decoding label
|
description: Automatically apply speculative-decoding label
|
||||||
conditions:
|
conditions:
|
||||||
|
- label != stale
|
||||||
- or:
|
- or:
|
||||||
- files~=^vllm/v1/spec_decode/
|
- files~=^vllm/v1/spec_decode/
|
||||||
- files~=^tests/v1/spec_decode/
|
- files~=^tests/v1/spec_decode/
|
||||||
@ -189,6 +209,7 @@ pull_request_rules:
|
|||||||
- name: label-v1
|
- name: label-v1
|
||||||
   description: Automatically apply v1 label
   conditions:
+    - label != stale
     - or:
       - files~=^vllm/v1/
       - files~=^tests/v1/

@@ -201,6 +222,7 @@ pull_request_rules:
   description: Automatically apply tpu label
   # Keep this list in sync with `label-tpu-remove` conditions
   conditions:
+    - label != stale
     - or:
       - files~=tpu.py
       - files~=_tpu

@@ -216,6 +238,7 @@ pull_request_rules:
   description: Automatically remove tpu label
   # Keep this list in sync with `label-tpu` conditions
   conditions:
+    - label != stale
     - and:
       - -files~=tpu.py
       - -files~=_tpu

@@ -230,9 +253,9 @@ pull_request_rules:
 - name: label-tool-calling
   description: Automatically add tool-calling label
   conditions:
+    - label != stale
     - or:
       - files~=^tests/tool_use/
-      - files~=^tests/mistral_tool_use/
       - files~=^tests/entrypoints/openai/tool_parsers/
       - files=tests/entrypoints/openai/test_chat_with_tool_reasoning.py
       - files~=^vllm/entrypoints/openai/tool_parsers/

@@ -249,8 +272,9 @@ pull_request_rules:

 - name: ping author on conflicts and add 'needs-rebase' label
   conditions:
+    - label != stale
     - conflict
     - -closed
   actions:
     label:
       add:

@@ -264,20 +288,55 @@ pull_request_rules:

 - name: assign reviewer for tensorizer changes
   conditions:
+    - label != stale
+    - or:
       - files~=^vllm/model_executor/model_loader/tensorizer.py
       - files~=^vllm/model_executor/model_loader/tensorizer_loader.py
       - files~=^tests/entrypoints/openai/test_tensorizer_entrypoint.py
-      - files~=^tests/tensorizer_loader/
+      - files~=^tests/model_executor/model_loader/tensorizer_loader/
   actions:
     assign:
       users:
         - "sangstar"

+- name: assign reviewer for modelopt changes
+  conditions:
+    - label != stale
+    - or:
+      - files~=^vllm/model_executor/layers/quantization/modelopt\.py$
+      - files~=^vllm/model_executor/layers/quantization/__init__\.py$
+      - files~=^tests/models/quantization/test_modelopt\.py$
+      - files~=^tests/quantization/test_modelopt\.py$
+      - files~=^tests/models/quantization/test_nvfp4\.py$
+      - files~=^docs/features/quantization/modelopt\.md$
+  actions:
+    assign:
+      users:
+        - "Edwardf0t1"
+
 - name: remove 'needs-rebase' label when conflict is resolved
   conditions:
     - -conflict
     - -closed
   actions:
     label:
       remove:
         - needs-rebase

+- name: label-kv-connector
+  description: Automatically apply kv-connector label
+  conditions:
+    - label != stale
+    - or:
+      - files~=^examples/online_serving/disaggregated[^/]*/.*
+      - files~=^examples/offline_inference/disaggregated[^/]*/.*
+      - files~=^examples/others/lmcache/
+      - files~=^tests/v1/kv_connector/
+      - files~=^vllm/distributed/kv_transfer/
+      - title~=(?i)\bP/?D\b
+      - title~=(?i)NIXL
+      - title~=(?i)LMCache
+  actions:
+    label:
+      add:
+        - kv-connector
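The `files~=` and `title~=` conditions above are regular expressions matched against the PR's changed file paths and its title. Mergify evaluates these rules itself; the Python sketch below is only a hypothetical approximation of the `label-kv-connector` rule's `or:` block, using a subset of the patterns shown in the diff.

```python
import re

# Patterns copied from the label-kv-connector rule above (subset).
KV_CONNECTOR_FILE_PATTERNS = [
    r"^tests/v1/kv_connector/",
    r"^vllm/distributed/kv_transfer/",
]
KV_CONNECTOR_TITLE_PATTERNS = [
    r"(?i)\bP/?D\b",
    r"(?i)NIXL",
    r"(?i)LMCache",
]


def should_label_kv_connector(changed_files: list[str], title: str) -> bool:
    """Rough approximation of the rule: any file pattern or title pattern matching is enough."""
    if any(re.search(p, f) for p in KV_CONNECTOR_FILE_PATTERNS for f in changed_files):
        return True
    return any(re.search(p, title) for p in KV_CONNECTOR_TITLE_PATTERNS)


# A PR touching the KV transfer code would get the label...
assert should_label_kv_connector(["vllm/distributed/kv_transfer/base.py"], "Refactor")
# ...and so would a title mentioning P/D disaggregation or NIXL.
assert should_label_kv_connector([], "Support P/D with NIXL")
```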
.github/scale-config.yml  (vendored, new file, 21 lines)
@@ -0,0 +1,21 @@
# scale-config.yml:
#   Powers what instance types are available for GHA auto-scaled
#   runners. Runners listed here will be available as self hosted
#   runners, configuration is directly pulled from the main branch.
# runner_types:
#   runner_label:
#     instance_type: m4.large
#     os: linux
#     # min_available defaults to the global cfg in the ALI Terraform
#     min_available: undefined
#     # when max_available value is not defined, no max runners is enforced
#     max_available: undefined
#     disk_size: 50
#     is_ephemeral: true

runner_types:
  linux.2xlarge:
    disk_size: 150
    instance_type: c5.2xlarge
    is_ephemeral: true
    os: linux
.github/workflows/add_label_automerge.yml  (vendored, 2 lines changed)
@@ -10,7 +10,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
     - name: Add label
-      uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
+      uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
       with:
         script: |
           github.rest.issues.addLabels({
.github/workflows/bc-lint.yml  (vendored, new file, 29 lines)
@@ -0,0 +1,29 @@
name: BC Lint

on:
  pull_request:
    types:
      - opened
      - synchronize
      - reopened
      - labeled
      - unlabeled

jobs:
  bc_lint:
    if: github.repository_owner == 'vllm-project'
    runs-on: ubuntu-latest
    steps:
      - name: Run BC Lint Action
        uses: pytorch/test-infra/.github/actions/bc-lint@main
        with:
          repo: ${{ github.event.pull_request.head.repo.full_name }}
          base_sha: ${{ github.event.pull_request.base.sha }}
          head_sha: ${{ github.event.pull_request.head.sha }}
          suppression: ${{ contains(github.event.pull_request.labels.*.name, 'suppress-bc-linter') }}
          docs_link: 'https://github.com/pytorch/test-infra/wiki/BC-Linter'
          config_dir: .github

concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}
  cancel-in-progress: true
.github/workflows/cleanup_pr_body.yml  (vendored, 2 lines changed)
@@ -16,7 +16,7 @@ jobs:
       uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

     - name: Set up Python
-      uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
+      uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
       with:
         python-version: '3.12'

.github/workflows/issue_autolabel.yml  (vendored, new file, 309 lines)
@@ -0,0 +1,309 @@
name: Label issues based on keywords
on:
  issues:
    types: [opened, edited, reopened]
permissions:
  issues: write # needed so the workflow can add labels
  contents: read
concurrency:
  group: issue-labeler-${{ github.event.issue.number }}
  cancel-in-progress: true
jobs:
  add-labels:
    runs-on: ubuntu-latest
    steps:
      - name: Label issues based on keywords
        uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
        with:
          script: |
            // Configuration: Add new labels and keywords here
            const labelConfig = {
              rocm: {
                // Keyword search - matches whole words only (with word boundaries)
                keywords: [
                  { term: "composable kernel", searchIn: "both" },
                  { term: "rccl", searchIn: "body" },      // only search in body
                  { term: "migraphx", searchIn: "title" }, // only search in title
                  { term: "hipgraph", searchIn: "both" },
                  { term: "ROCm System Management Interface", searchIn: "body" },
                ],

                // Substring search - matches anywhere in text (partial matches)
                substrings: [
                  { term: "VLLM_ROCM_", searchIn: "both" },
                  { term: "aiter", searchIn: "title" },
                  { term: "rocm", searchIn: "title" },
                  { term: "amd", searchIn: "title" },
                  { term: "hip-", searchIn: "both" },
                  { term: "gfx", searchIn: "both" },
                  { term: "cdna", searchIn: "both" },
                  { term: "rdna", searchIn: "both" },
                  { term: "torch_hip", searchIn: "body" }, // only in body
                  { term: "_hip", searchIn: "both" },
                  { term: "hip_", searchIn: "both" },

                  // ROCm tools and libraries
                  { term: "hipify", searchIn: "both" },
                ],

                // Regex patterns - for complex pattern matching
                regexPatterns: [
                  {
                    pattern: "\\bmi\\d{3}[a-z]*\\b",
                    description: "AMD GPU names (mi + 3 digits + optional letters)",
                    flags: "gi",
                    searchIn: "both" // "title", "body", or "both"
                  }
                ],
              },
            };

            // Helper function to create regex based on search type
            function createSearchRegex(term, type) {
              // Escape special regex characters in the term
              const escapedTerm = term.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
              switch (type) {
                case 'keyword':
                  // Word boundary search - matches whole words only
                  return new RegExp(`\\b${escapedTerm}\\b`, "gi");
                case 'substring':
                  // Substring search - matches anywhere in the text
                  return new RegExp(escapedTerm, "gi");
                default:
                  throw new Error(`Unknown search type: ${type}`);
              }
            }

            // Helper function to find matching terms in text with line information
            function findMatchingTermsWithLines(text, searchTerms = [], searchType = 'keyword', searchLocation = '') {
              const matches = [];
              const lines = text.split('\n');

              for (const termConfig of searchTerms) {
                let regex;
                let term, searchIn, pattern, description, flags;

                // Handle different input formats (string or object)
                if (typeof termConfig === 'string') {
                  term = termConfig;
                  searchIn = 'both'; // default
                } else {
                  term = termConfig.term;
                  searchIn = termConfig.searchIn || 'both';
                  pattern = termConfig.pattern;
                  description = termConfig.description;
                  flags = termConfig.flags;
                }

                // Skip if this term shouldn't be searched in the current location
                if (searchIn !== 'both' && searchIn !== searchLocation) {
                  continue;
                }

                // Create appropriate regex
                if (searchType === 'regex') {
                  regex = new RegExp(pattern, flags || "gi");
                } else {
                  regex = createSearchRegex(term, searchType);
                }

                const termMatches = [];

                // Check each line for matches
                lines.forEach((line, lineIndex) => {
                  const lineMatches = line.match(regex);
                  if (lineMatches) {
                    lineMatches.forEach(match => {
                      termMatches.push({
                        match: match,
                        lineNumber: lineIndex + 1,
                        lineContent: line.trim(),
                        searchType: searchType,
                        searchLocation: searchLocation,
                        originalTerm: term || pattern,
                        description: description,
                        // Show context around the match in the line
                        context: line.length > 100 ?
                          line.substring(Math.max(0, line.toLowerCase().indexOf(match.toLowerCase()) - 30),
                            line.toLowerCase().indexOf(match.toLowerCase()) + match.length + 30) + '...'
                          : line.trim()
                      });
                    });
                  }
                });

                if (termMatches.length > 0) {
                  matches.push({
                    term: term || (description || pattern),
                    searchType: searchType,
                    searchLocation: searchLocation,
                    searchIn: searchIn,
                    pattern: pattern,
                    matches: termMatches,
                    count: termMatches.length
                  });
                }
              }

              return matches;
            }

            // Helper function to check if label should be added
            async function processLabel(labelName, config) {
              const body = context.payload.issue.body || "";
              const title = context.payload.issue.title || "";

              core.notice(`Processing label: ${labelName}`);
              core.notice(`Issue Title: "${title}"`);
              core.notice(`Issue Body length: ${body.length} characters`);

              let shouldAddLabel = false;
              let allMatches = [];
              let reason = '';

              const keywords = config.keywords || [];
              const substrings = config.substrings || [];
              const regexPatterns = config.regexPatterns || [];

              core.notice(`Searching with ${keywords.length} keywords, ${substrings.length} substrings, and ${regexPatterns.length} regex patterns`);

              // Search in title
              if (title.trim()) {
                core.notice(`Searching in title: "${title}"`);
                const titleKeywordMatches = findMatchingTermsWithLines(title, keywords, 'keyword', 'title');
                const titleSubstringMatches = findMatchingTermsWithLines(title, substrings, 'substring', 'title');
                const titleRegexMatches = findMatchingTermsWithLines(title, regexPatterns, 'regex', 'title');
                allMatches.push(...titleKeywordMatches, ...titleSubstringMatches, ...titleRegexMatches);
              }

              // Search in body
              if (body.trim()) {
                core.notice(`Searching in body (${body.length} characters)`);
                const bodyKeywordMatches = findMatchingTermsWithLines(body, keywords, 'keyword', 'body');
                const bodySubstringMatches = findMatchingTermsWithLines(body, substrings, 'substring', 'body');
                const bodyRegexMatches = findMatchingTermsWithLines(body, regexPatterns, 'regex', 'body');
                allMatches.push(...bodyKeywordMatches, ...bodySubstringMatches, ...bodyRegexMatches);
              }

              if (allMatches.length > 0) {
                core.notice(`Found ${allMatches.length} matching term(s):`);

                for (const termMatch of allMatches) {
                  const locationText = termMatch.searchLocation === 'title' ? 'title' : 'body';
                  const searchInText = termMatch.searchIn === 'both' ? 'both' : termMatch.searchIn;

                  if (termMatch.searchType === 'regex') {
                    core.notice(`  📍 Regex: "${termMatch.term}" (pattern: ${termMatch.pattern}) found ${termMatch.count} time(s) in ${locationText} (configured to search in: ${searchInText}):`);
                  } else {
                    core.notice(`  📍 Term: "${termMatch.term}" (${termMatch.searchType} search) found ${termMatch.count} time(s) in ${locationText} (configured to search in: ${searchInText}):`);
                  }

                  // Show details for each match
                  termMatch.matches.forEach((match, index) => {
                    core.notice(`    ${index + 1}. Line ${match.lineNumber} in ${match.searchLocation}: "${match.match}" [${match.searchType}]`);
                    if (match.description) {
                      core.notice(`       Description: ${match.description}`);
                    }
                    core.notice(`       Context: ${match.context}`);
                    if (match.lineContent !== match.context) {
                      core.notice(`       Full line: ${match.lineContent}`);
                    }
                  });
                }

                shouldAddLabel = true;
                const totalMatches = allMatches.reduce((sum, t) => sum + t.count, 0);
                const titleMatches = allMatches.filter(t => t.searchLocation === 'title').reduce((sum, t) => sum + t.count, 0);
                const bodyMatches = allMatches.filter(t => t.searchLocation === 'body').reduce((sum, t) => sum + t.count, 0);
                const keywordMatches = allMatches.filter(t => t.searchType === 'keyword').reduce((sum, t) => sum + t.count, 0);
                const substringMatches = allMatches.filter(t => t.searchType === 'substring').reduce((sum, t) => sum + t.count, 0);
                const regexMatches = allMatches.filter(t => t.searchType === 'regex').reduce((sum, t) => sum + t.count, 0);

                reason = `Found ${totalMatches} total matches (${titleMatches} in title, ${bodyMatches} in body) - ${keywordMatches} keyword matches, ${substringMatches} substring matches, ${regexMatches} regex matches`;
              }

              core.notice(`Final decision: ${shouldAddLabel ? 'ADD LABEL' : 'DO NOT ADD LABEL'}`);
              core.notice(`Reason: ${reason || 'No matching terms found'}`);

              if (shouldAddLabel) {
                const existingLabels = context.payload.issue.labels.map(l => l.name);
                if (!existingLabels.includes(labelName)) {
                  await github.rest.issues.addLabels({
                    owner: context.repo.owner,
                    repo: context.repo.repo,
                    issue_number: context.issue.number,
                    labels: [labelName],
                  });
                  core.notice(`Label "${labelName}" added. ${reason}`);
                  return true;
                }
                core.notice(`Label "${labelName}" already present.`);
                return false;
              }

              core.notice(`No matching terms found for label "${labelName}".`);
              return false;
            }

            // Process all configured labels
            const processLabels = Object.entries(labelConfig)
              .map(([labelName, config]) => processLabel(labelName, config));
            const labelsAdded = await Promise.all(processLabels);
            const numLabelsAdded = labelsAdded.reduce((x, y) => x + y, 0);
            core.notice(`Processing complete. ${numLabelsAdded} label(s) added.`);
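The keyword/substring distinction in the script above comes down to whether the escaped term is wrapped in `\b` word boundaries. A minimal Python sketch of the same matching rule (the workflow's JavaScript is the source of truth; this is an illustration only):

```python
import re


def create_search_regex(term: str, search_type: str) -> re.Pattern:
    """Mirror of createSearchRegex: 'keyword' = whole-word match, 'substring' = anywhere."""
    escaped = re.escape(term)
    if search_type == "keyword":
        return re.compile(rf"\b{escaped}\b", re.IGNORECASE)
    if search_type == "substring":
        return re.compile(escaped, re.IGNORECASE)
    raise ValueError(f"Unknown search type: {search_type}")


# "rccl" as a keyword matches only the whole word...
assert create_search_regex("rccl", "keyword").search("NCCL vs RCCL timeout") is not None
assert create_search_regex("rccl", "keyword").search("pyrcclext") is None
# ...while a substring like "gfx" also matches inside "gfx942".
assert create_search_regex("gfx", "substring").search("Running on gfx942") is not None
```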
.github/workflows/lint-and-deploy.yaml  (vendored, deleted, 89 lines)
@@ -1,89 +0,0 @@
name: Lint and Deploy Charts

on: pull_request

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

permissions:
  contents: read

jobs:
  lint-and-deploy:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          fetch-depth: 0

      - name: Set up Helm
        uses: azure/setup-helm@b9e51907a09c216f16ebe8536097933489208112 # v4.3.0
        with:
          version: v3.14.4

      #Python is required because ct lint runs Yamale and yamllint which require Python.
      - uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
        with:
          python-version: '3.13'

      - name: Set up chart-testing
        uses: helm/chart-testing-action@0d28d3144d3a25ea2cc349d6e59901c4ff469b3b # v2.7.0
        with:
          version: v3.10.1

      - name: Run chart-testing (lint)
        run: ct lint --target-branch ${{ github.event.repository.default_branch }} --chart-dirs examples/online_serving/chart-helm --charts examples/online_serving/chart-helm

      - name: Setup minio
        run: |
          docker network create vllm-net
          docker run -d -p 9000:9000 --name minio --net vllm-net \
             -e "MINIO_ACCESS_KEY=minioadmin" \
             -e "MINIO_SECRET_KEY=minioadmin" \
             -v /tmp/data:/data \
             -v /tmp/config:/root/.minio \
             minio/minio server /data
          export AWS_ACCESS_KEY_ID=minioadmin
          export AWS_SECRET_ACCESS_KEY=minioadmin
          export AWS_EC2_METADATA_DISABLED=true
          mkdir opt-125m
          cd opt-125m && curl -O -Ls "https://huggingface.co/facebook/opt-125m/resolve/main/{pytorch_model.bin,config.json,generation_config.json,merges.txt,special_tokens_map.json,tokenizer_config.json,vocab.json}" && cd ..
          aws --endpoint-url http://127.0.0.1:9000/ s3 mb s3://testbucket
          aws --endpoint-url http://127.0.0.1:9000/ s3 cp opt-125m/ s3://testbucket/opt-125m --recursive

      - name: Create kind cluster
        uses: helm/kind-action@a1b0e391336a6ee6713a0583f8c6240d70863de3 # v1.12.0

      - name: Build the Docker image vllm cpu
        run: docker buildx build -f docker/Dockerfile.cpu -t vllm-cpu-env .

      - name: Configuration of docker images, network and namespace for the kind cluster
        run: |
          docker pull amazon/aws-cli:2.6.4
          kind load docker-image amazon/aws-cli:2.6.4 --name chart-testing
          kind load docker-image vllm-cpu-env:latest --name chart-testing
          docker network connect vllm-net "$(docker ps -aqf "name=chart-testing-control-plane")"
          kubectl create ns ns-vllm

      - name: Run chart-testing (install)
        run: |
          export AWS_ACCESS_KEY_ID=minioadmin
          export AWS_SECRET_ACCESS_KEY=minioadmin
          sleep 30 && kubectl -n ns-vllm logs -f "$(kubectl -n ns-vllm get pods | awk '/deployment/ {print $1;exit}')" &
          helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/online_serving/chart-helm -f examples/online_serving/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set image.env[2].name=VLLM_CPU_CI_ENV --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string image.env[2].value="1" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env"

      - name: curl test
        run: |
          kubectl -n ns-vllm port-forward service/test-vllm-service 8001:80 &
          sleep 10
          CODE="$(curl -v -f --location http://localhost:8001/v1/completions \
            --header "Content-Type: application/json" \
            --data '{
              "model": "opt-125m",
              "prompt": "San Francisco is a",
              "max_tokens": 7,
              "temperature": 0
            }'):$CODE"
          echo "$CODE"
.github/workflows/pre-commit.yml  (vendored, 2 lines changed)
@@ -17,7 +17,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
     - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-    - uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
+    - uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
       with:
         python-version: "3.12"
     - run: echo "::add-matcher::.github/workflows/matchers/actionlint.json"
.github/workflows/publish.yml  (vendored, deleted, 111 lines)
@@ -1,111 +0,0 @@
# This workflow will upload a Python Package to Release asset
# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions

name: Create Release

on:
  push:
    tags:
      - v*

# Needed to create release and upload assets
permissions:
  contents: write

jobs:
  release:
    # Retrieve tag and create release
    name: Create Release
    runs-on: ubuntu-latest
    outputs:
      upload_url: ${{ steps.create_release.outputs.upload_url }}
    steps:
      - name: Checkout
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

      - name: Extract branch info
        shell: bash
        run: |
          echo "release_tag=${GITHUB_REF#refs/*/}" >> "$GITHUB_ENV"

      - name: Create Release
        id: create_release
        uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
        env:
          RELEASE_TAG: ${{ env.release_tag }}
        with:
          github-token: "${{ secrets.GITHUB_TOKEN }}"
          script: |
            const script = require('.github/workflows/scripts/create_release.js')
            await script(github, context, core)

# NOTE(simon): No longer build wheel using GitHub Actions. See buildkite's release workflow.
# wheel:
#   name: Build Wheel
#   runs-on: ${{ matrix.os }}
#   needs: release

#   strategy:
#     fail-fast: false
#     matrix:
#       os: ['ubuntu-20.04']
#       python-version: ['3.9', '3.10', '3.11', '3.12']
#       pytorch-version: ['2.4.0']  # Must be the most recent version that meets requirements/cuda.txt.
#       cuda-version: ['11.8', '12.1']

#   steps:
#     - name: Checkout
#       uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

#     - name: Setup ccache
#       uses: hendrikmuhs/ccache-action@ed74d11c0b343532753ecead8a951bb09bb34bc9 # v1.2.14
#       with:
#         create-symlink: true
#         key: ${{ github.job }}-${{ matrix.python-version }}-${{ matrix.cuda-version }}

#     - name: Set up Linux Env
#       if: ${{ runner.os == 'Linux' }}
#       run: |
#         bash -x .github/workflows/scripts/env.sh

#     - name: Set up Python
#       uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
#       with:
#         python-version: ${{ matrix.python-version }}

#     - name: Install CUDA ${{ matrix.cuda-version }}
#       run: |
#         bash -x .github/workflows/scripts/cuda-install.sh ${{ matrix.cuda-version }} ${{ matrix.os }}

#     - name: Install PyTorch ${{ matrix.pytorch-version }} with CUDA ${{ matrix.cuda-version }}
#       run: |
#         bash -x .github/workflows/scripts/pytorch-install.sh ${{ matrix.python-version }} ${{ matrix.pytorch-version }} ${{ matrix.cuda-version }}

#     - name: Build wheel
#       shell: bash
#       env:
#         CMAKE_BUILD_TYPE: Release # do not compile with debug symbol to reduce wheel size
#       run: |
#         bash -x .github/workflows/scripts/build.sh ${{ matrix.python-version }} ${{ matrix.cuda-version }}
#         wheel_name=$(find dist -name "*whl" -print0 | xargs -0 -n 1 basename)
#         asset_name=${wheel_name//"linux"/"manylinux1"}
#         echo "wheel_name=${wheel_name}" >> "$GITHUB_ENV"
#         echo "asset_name=${asset_name}" >> "$GITHUB_ENV"

#     - name: Upload Release Asset
#       uses: actions/upload-release-asset@e8f9f06c4b078e705bd2ea027f0926603fc9b4d5 # v1.0.2
#       env:
#         GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
#       with:
#         upload_url: ${{ needs.release.outputs.upload_url }}
#         asset_path: ./dist/${{ env.wheel_name }}
#         asset_name: ${{ env.asset_name }}
#         asset_content_type: application/*

# (Danielkinz): This last step will publish the .whl to pypi. Warning: untested
# - name: Publish package
#   uses: pypa/gh-action-pypi-publish@release/v1.8
#   with:
#     repository-url: https://test.pypi.org/legacy/
#     password: ${{ secrets.PYPI_API_TOKEN }}
#     skip-existing: true
.github/workflows/reminder_comment.yml  (vendored, 51 lines changed)
@@ -9,19 +9,46 @@ jobs:
     runs-on: ubuntu-latest
     steps:
     - name: Remind to run full CI on PR
-      uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
+      uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
       with:
         script: |
-          github.rest.issues.createComment({
-            owner: context.repo.owner,
-            repo: context.repo.repo,
-            issue_number: context.issue.number,
-            body: '👋 Hi! Thank you for contributing to the vLLM project.\n\n' +
-              '💬 Join our developer Slack at https://slack.vllm.ai to discuss your PR in #pr-reviews, coordinate on features in #feat- channels, or join special interest groups in #sig- channels.\n\n' +
-              'Just a reminder: PRs would not trigger full CI run by default. Instead, it would only run `fastcheck` CI which starts running only a small and essential subset of CI tests to quickly catch errors. You can run other CI tests on top of those by going to your `fastcheck` build on Buildkite UI (linked in the PR checks section) and unblock them. If you do not have permission to unblock, ping `simon-mo` or `khluu` to add you in our Buildkite org.\n\n' +
-              'Once the PR is approved and ready to go, your PR reviewer(s) can run CI to test the changes comprehensively before merging.\n\n' +
-              'To run CI, PR reviewers can either: Add `ready` label to the PR or enable auto-merge.\n\n' +
-              '🚀'
-          })
+          try {
+            // Get the PR author
+            const prAuthor = context.payload.pull_request.user.login;
+
+            // Check if this is the author's first PR in this repository
+            // Use GitHub's search API to find all PRs by this author
+            const { data: searchResults } = await github.rest.search.issuesAndPullRequests({
+              q: `repo:${context.repo.owner}/${context.repo.repo} type:pr author:${prAuthor}`,
+              per_page: 100
+            });
+
+            const authorPRCount = searchResults.total_count;
+
+            console.log(`Found ${authorPRCount} PRs by ${prAuthor}`);
+
+            // Only post comment if this is the first PR (only one PR by this author)
+            if (authorPRCount === 1) {
+              console.log(`Posting welcome comment for first-time contributor: ${prAuthor}`);
+              await github.rest.issues.createComment({
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                issue_number: context.issue.number,
+                body: '👋 Hi! Thank you for contributing to the vLLM project.\n\n' +
+                  '💬 Join our developer Slack at https://slack.vllm.ai to discuss your PR in #pr-reviews, coordinate on features in #feat- channels, or join special interest groups in #sig- channels.\n\n' +
+                  'Just a reminder: PRs would not trigger full CI run by default. Instead, it would only run `fastcheck` CI which starts running only a small and essential subset of CI tests to quickly catch errors. \n\n' +
+                  'You ask your reviewers to trigger select CI tests on top of `fastcheck` CI. \n\n' +
+                  'Once the PR is approved and ready to go, your PR reviewer(s) can run CI to test the changes comprehensively before merging.\n\n' +
+                  'To run CI, PR reviewers can either: Add `ready` label to the PR or enable auto-merge.\n\n' +
+                  'If you have any questions, please reach out to us on Slack at https://slack.vllm.ai.\n\n' +
+                  '🚀'
+              });
+            } else {
+              console.log(`Skipping comment for ${prAuthor} - not their first PR (${authorPRCount} PRs found)`);
+            }
+          } catch (error) {
+            console.error('Error checking PR history or posting comment:', error);
+            // Don't fail the workflow, just log the error
+          }
       env:
         GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
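The rewritten script only greets first-time contributors: it counts the author's PRs via the search API and comments only when that count is 1. A hedged Python sketch of the same decision, with `search_pr_count` as a hypothetical stand-in for the `search.issuesAndPullRequests` call:

```python
def should_post_welcome(author_pr_count: int) -> bool:
    """Comment only when the PR being opened is the author's first PR in the repo."""
    return author_pr_count == 1


def handle_new_pr(repo: str, author: str, search_pr_count) -> str:
    # search_pr_count stands in for GitHub's search API, queried with
    #   f"repo:{repo} type:pr author:{author}"
    count = search_pr_count(f"repo:{repo} type:pr author:{author}")
    if should_post_welcome(count):
        return f"Posting welcome comment for first-time contributor: {author}"
    return f"Skipping comment for {author} - not their first PR ({count} PRs found)"


# Example with a fake search backend that reports a single PR by this author.
print(handle_new_pr("vllm-project/vllm", "newcontributor", lambda q: 1))
```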
.github/workflows/stale.yml  (vendored, 2 lines changed)
@@ -13,7 +13,7 @@ jobs:
       actions: write
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/stale@5bef64f19d7facfb25b37b414482c7164d639639 # v9.1.0
+      - uses: actions/stale@3a9db7e6a41a89f618792c92c0e97cc736e1b13f # v10.0.0
        with:
          # Increasing this value ensures that changes to this workflow
          # propagate to all issues and PRs in days rather than months
.gitignore  (vendored, 12 lines changed)
@@ -4,7 +4,7 @@
 # vllm-flash-attn built from source
 vllm/vllm_flash_attn/*

 # triton jit
 .triton

 # Byte-compiled / optimized / DLL files

@@ -177,6 +177,14 @@ cython_debug/
 # VSCode
 .vscode/

+# Claude
+CLAUDE.md
+.claude/
+
+# Codex
+AGENTS.md
+.codex/
+
 # DS Store
 .DS_Store

@@ -209,4 +217,4 @@ shellcheck*/
 csrc/moe/marlin_moe_wna16/kernel_*

 # Ignore ep_kernels_workspace folder
 ep_kernels_workspace/
.pre-commit-config.yaml
@@ -21,7 +21,7 @@ repos:
   - id: ruff-format
     files: ^(.buildkite|benchmarks|examples)/.*
 - repo: https://github.com/crate-ci/typos
-  rev: v1.34.0
+  rev: v1.35.5
   hooks:
   - id: typos
 - repo: https://github.com/PyCQA/isort

@@ -49,7 +49,7 @@ repos:
   rev: 0.6.17
   hooks:
   - id: pip-compile
-    args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu128]
+    args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu128, --python-platform, x86_64-manylinux_2_28]
     files: ^requirements/test\.(in|txt)$
 - repo: local
   hooks:

@@ -60,38 +60,32 @@ repos:
     files: ^requirements/test\.(in|txt)$
   - id: mypy-local
     name: Run mypy for local Python installation
-    entry: tools/mypy.sh 0 "local"
-    language: python
-    types: [python]
-    additional_dependencies: &mypy_deps [mypy==1.11.1, types-cachetools, types-setuptools, types-PyYAML, types-requests, pydantic]
+    entry: python tools/pre_commit/mypy.py 0 "local"
     stages: [pre-commit] # Don't run in CI
+    <<: &mypy_common
+      language: python
+      types_or: [python, pyi]
+      require_serial: true
+      additional_dependencies: [mypy==1.11.1, regex, types-cachetools, types-setuptools, types-PyYAML, types-requests, types-torch, pydantic]
   - id: mypy-3.9 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
     name: Run mypy for Python 3.9
-    entry: tools/mypy.sh 1 "3.9"
-    language: python
-    types: [python]
-    additional_dependencies: *mypy_deps
+    entry: python tools/pre_commit/mypy.py 1 "3.9"
+    <<: *mypy_common
     stages: [manual] # Only run in CI
   - id: mypy-3.10 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
     name: Run mypy for Python 3.10
-    entry: tools/mypy.sh 1 "3.10"
-    language: python
-    types: [python]
-    additional_dependencies: *mypy_deps
+    entry: python tools/pre_commit/mypy.py 1 "3.10"
+    <<: *mypy_common
     stages: [manual] # Only run in CI
   - id: mypy-3.11 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
     name: Run mypy for Python 3.11
-    entry: tools/mypy.sh 1 "3.11"
-    language: python
-    types: [python]
-    additional_dependencies: *mypy_deps
+    entry: python tools/pre_commit/mypy.py 1 "3.11"
+    <<: *mypy_common
     stages: [manual] # Only run in CI
   - id: mypy-3.12 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
     name: Run mypy for Python 3.12
-    entry: tools/mypy.sh 1 "3.12"
-    language: python
-    types: [python]
-    additional_dependencies: *mypy_deps
+    entry: python tools/pre_commit/mypy.py 1 "3.12"
+    <<: *mypy_common
     stages: [manual] # Only run in CI
   - id: shellcheck
     name: Lint shell scripts

@@ -155,18 +149,15 @@ repos:
     additional_dependencies: [regex]
   - id: check-pickle-imports
     name: Prevent new pickle/cloudpickle imports
-    entry: python tools/check_pickle_imports.py
+    entry: python tools/pre_commit/check_pickle_imports.py
     language: python
     types: [python]
-    pass_filenames: false
-    additional_dependencies: [pathspec, regex]
+    additional_dependencies: [regex]
   - id: validate-config
     name: Validate configuration has default values and that each field has a docstring
     entry: python tools/validate_config.py
     language: python
-    types: [python]
-    pass_filenames: true
-    files: vllm/config.py|tests/test_config.py|vllm/entrypoints/openai/cli_args.py
+    additional_dependencies: [regex]
   # Keep `suggestion` last
   - id: suggestion
     name: Suggestion
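The mypy hooks now share `language`, `types_or`, `require_serial`, and `additional_dependencies` through a YAML anchor that is merged into each hook via `<<: *mypy_common`. A quick PyYAML check of how such a merge key resolves, on an illustrative snippet rather than the actual config, assuming PyYAML is available:

```python
import yaml  # PyYAML

SNIPPET = """
hooks:
  - id: mypy-local
    entry: python tools/pre_commit/mypy.py 0 "local"
    stages: [pre-commit]
    <<: &mypy_common
      language: python
      types_or: [python, pyi]
      require_serial: true
  - id: mypy-3.9
    entry: python tools/pre_commit/mypy.py 1 "3.9"
    stages: [manual]
    <<: *mypy_common
"""

hooks = yaml.safe_load(SNIPPET)["hooks"]
# Both hooks pick up the shared keys from the anchored mapping.
assert hooks[0]["language"] == hooks[1]["language"] == "python"
assert hooks[1]["types_or"] == ["python", "pyi"]
```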
.readthedocs.yaml
@@ -13,6 +13,7 @@ build:

 mkdocs:
   configuration: mkdocs.yaml
+  fail_on_warning: true

 # Optionally declare the Python requirements required to build your docs
 python:
(unnamed file)
@@ -1 +1,2 @@
 collect_env.py
+vllm/model_executor/layers/fla/ops/*.py
CMakeLists.txt  (174 lines changed)
@@ -13,6 +13,10 @@ cmake_minimum_required(VERSION 3.26)
 # cmake --install . --component _C
 project(vllm_extensions LANGUAGES CXX)
+
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
 # CUDA by default, can be overridden by using -DVLLM_TARGET_DEVICE=... (used by setup.py)
 set(VLLM_TARGET_DEVICE "cuda" CACHE STRING "Target device backend for vLLM")
 message(STATUS "Build type: ${CMAKE_BUILD_TYPE}")

@@ -30,10 +34,10 @@ install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
 # Supported python versions. These versions will be searched in order, the
 # first match will be selected. These should be kept in sync with setup.py.
 #
-set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12")
+set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12" "3.13")

 # Supported AMD GPU architectures.
-set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201")
+set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151")

 #
 # Supported/expected torch versions for CUDA/ROCm.

@@ -45,8 +49,8 @@
 # requirements.txt files and should be kept consistent. The ROCm torch
 # versions are derived from docker/Dockerfile.rocm
 #
-set(TORCH_SUPPORTED_VERSION_CUDA "2.7.1")
-set(TORCH_SUPPORTED_VERSION_ROCM "2.7.0")
+set(TORCH_SUPPORTED_VERSION_CUDA "2.8.0")
+set(TORCH_SUPPORTED_VERSION_ROCM "2.8.0")

 #
 # Try to find python package with an executable that exactly matches

@@ -82,6 +86,9 @@ find_package(Torch REQUIRED)
 # Supported NVIDIA architectures.
 # This check must happen after find_package(Torch) because that's when CMAKE_CUDA_COMPILER_VERSION gets defined
 if(DEFINED CMAKE_CUDA_COMPILER_VERSION AND
+   CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0)
+  set(CUDA_SUPPORTED_ARCHS "7.5;8.0;8.6;8.7;8.9;9.0;10.0;11.0;12.0")
+elseif(DEFINED CMAKE_CUDA_COMPILER_VERSION AND
    CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8)
   set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0")
 else()

@@ -171,6 +178,25 @@ if(NVCC_THREADS AND VLLM_GPU_LANG STREQUAL "CUDA")
   list(APPEND VLLM_GPU_FLAGS "--threads=${NVCC_THREADS}")
 endif()
+
+#
+# Set compression mode for CUDA >=13.x.
+#
+if(VLLM_GPU_LANG STREQUAL "CUDA" AND
+   DEFINED CMAKE_CUDA_COMPILER_VERSION AND
+   CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0)
+  list(APPEND VLLM_GPU_FLAGS "--compress-mode=size")
+endif()
+
+#
+# Set CUDA include flags for CXX compiler.
+#
+if(VLLM_GPU_LANG STREQUAL "CUDA")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -I${CUDA_TOOLKIT_ROOT_DIR}/include")
+  if(CUDA_VERSION VERSION_GREATER_EQUAL 13.0)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -I${CUDA_TOOLKIT_ROOT_DIR}/include/cccl")
+  endif()
+endif()
+
 #
 # Use FetchContent for C++ dependencies that are compiled as part of vLLM's build process.
 # setup.py will override FETCHCONTENT_BASE_DIR to play nicely with sccache.

@@ -243,8 +269,8 @@ set(VLLM_EXT_SRC
   "csrc/sampler.cu"
   "csrc/cuda_view.cu"
   "csrc/quantization/gptq/q_gemm.cu"
-  "csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
-  "csrc/quantization/fp8/common.cu"
+  "csrc/quantization/w8a8/int8/scaled_quant.cu"
+  "csrc/quantization/w8a8/fp8/common.cu"
   "csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu"
   "csrc/quantization/gguf/gguf_kernel.cu"
   "csrc/quantization/activation_kernels.cu"

@@ -256,7 +282,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")

   # Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages when building.
-  set(CUTLASS_REVISION "v4.0.0" CACHE STRING "CUTLASS revision to use")
+  set(CUTLASS_REVISION "v4.2.1" CACHE STRING "CUTLASS revision to use")

   # Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
   if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR})

@@ -288,14 +314,13 @@
   list(APPEND VLLM_EXT_SRC
     "csrc/quantization/awq/gemm_kernels.cu"
     "csrc/permute_cols.cu"
-    "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
+    "csrc/quantization/w8a8/cutlass/scaled_mm_entry.cu"
     "csrc/quantization/fp4/nvfp4_quant_entry.cu"
     "csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu"
-    "csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu"
     "csrc/sparse/cutlass/sparse_scaled_mm_entry.cu"
     "csrc/cutlass_extensions/common.cpp"
-    "csrc/attention/mla/cutlass_mla_entry.cu"
-    "csrc/quantization/fp8/per_token_group_quant.cu")
+    "csrc/quantization/w8a8/fp8/per_token_group_quant.cu"
+    "csrc/quantization/w8a8/int8/per_token_group_quant.cu")

   set_gencode_flags_for_srcs(
     SRCS "${VLLM_EXT_SRC}"

@@ -357,9 +382,7 @@
   list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_KERNEL_SRC})

   set(MARLIN_SRCS
-    "csrc/quantization/marlin/dense/marlin_cuda_kernel.cu"
     "csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
-    "csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu"
     "csrc/quantization/gptq_marlin/gptq_marlin.cu"
     "csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
     "csrc/quantization/gptq_marlin/awq_marlin_repack.cu")

@@ -401,11 +424,11 @@
   cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}")
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND SCALED_MM_ARCHS)
     set(SRCS
-      "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm90.cu"
-      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu"
-      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_int8.cu"
-      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_azp_sm90_int8.cu"
-      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8.cu")
+      "csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm90.cu"
+      "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm90_fp8.cu"
+      "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm90_int8.cu"
+      "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_azp_sm90_int8.cu"
+      "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm90_fp8.cu")
     set_gencode_flags_for_srcs(
       SRCS "${SRCS}"
       CUDA_ARCHS "${SCALED_MM_ARCHS}")

@@ -429,12 +452,16 @@
   # The cutlass_scaled_mm kernels for Geforce Blackwell SM120 (c3x, i.e. CUTLASS 3.x) require
   # CUDA 12.8 or later
-  cuda_archs_loose_intersection(SCALED_MM_ARCHS "12.0;12.0a" "${CUDA_ARCHS}")
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
+    cuda_archs_loose_intersection(SCALED_MM_ARCHS "12.0f" "${CUDA_ARCHS}")
+  else()
+    cuda_archs_loose_intersection(SCALED_MM_ARCHS "12.0a" "${CUDA_ARCHS}")
+  endif()
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
     set(SRCS
-      "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm120.cu"
-      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm120_fp8.cu"
-      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm120_fp8.cu"
+      "csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm120.cu"
+      "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm120_fp8.cu"
+      "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm120_fp8.cu"
     )
     set_gencode_flags_for_srcs(
       SRCS "${SRCS}"

@@ -459,12 +486,16 @@
   # The cutlass_scaled_mm kernels for Blackwell SM100 (c3x, i.e. CUTLASS 3.x)
   # require CUDA 12.8 or later
-  cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a" "${CUDA_ARCHS}")
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
+    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
+  else()
+    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
+  endif()
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
     set(SRCS
-      "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu"
-      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu"
-      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8.cu"
+      "csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm100.cu"
+      "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm100_fp8.cu"
+      "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm100_fp8.cu"
     )
     set_gencode_flags_for_srcs(
       SRCS "${SRCS}"

@@ -495,7 +526,7 @@
     # subtract out the archs that are already built for 3x
     list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
     if (SCALED_MM_2X_ARCHS)
-      set(SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu")
+      set(SRCS "csrc/quantization/w8a8/cutlass/scaled_mm_c2x.cu")
       set_gencode_flags_for_srcs(
         SRCS "${SRCS}"
         CUDA_ARCHS "${SCALED_MM_2X_ARCHS}")

@@ -539,10 +570,15 @@
   # The nvfp4_scaled_mm_sm120 kernels for Geforce Blackwell SM120 require
   # CUDA 12.8 or later
-  cuda_archs_loose_intersection(FP4_ARCHS "12.0;12.0a" "${CUDA_ARCHS}")
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
+    cuda_archs_loose_intersection(FP4_ARCHS "12.0f" "${CUDA_ARCHS}")
+  else()
+    cuda_archs_loose_intersection(FP4_ARCHS "12.0a" "${CUDA_ARCHS}")
+  endif()
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS)
     set(SRCS
       "csrc/quantization/fp4/nvfp4_quant_kernels.cu"
+      "csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu"
       "csrc/quantization/fp4/nvfp4_scaled_mm_sm120_kernels.cu")
     set_gencode_flags_for_srcs(
       SRCS "${SRCS}"

@@ -557,10 +593,15 @@
   endif()

   # FP4 Archs and flags
-  cuda_archs_loose_intersection(FP4_ARCHS "10.0a" "${CUDA_ARCHS}")
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
+    cuda_archs_loose_intersection(FP4_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
+  else()
+    cuda_archs_loose_intersection(FP4_ARCHS "10.0a;10.1a;12.0a;12.1a" "${CUDA_ARCHS}")
+  endif()
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS)
     set(SRCS
       "csrc/quantization/fp4/nvfp4_quant_kernels.cu"
+      "csrc/quantization/fp4/activation_nvfp4_quant_fusion_kernels.cu"
       "csrc/quantization/fp4/nvfp4_experts_quant.cu"
       "csrc/quantization/fp4/nvfp4_scaled_mm_kernels.cu"
       "csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu")

@@ -578,10 +619,13 @@
   endif()

   # CUTLASS MLA Archs and flags
-  cuda_archs_loose_intersection(MLA_ARCHS "10.0a" "${CUDA_ARCHS}")
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
+    cuda_archs_loose_intersection(MLA_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
+  else()
+    cuda_archs_loose_intersection(MLA_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
+  endif()
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND MLA_ARCHS)
     set(SRCS
-      "csrc/attention/mla/cutlass_mla_kernels.cu"
       "csrc/attention/mla/sm100_cutlass_mla_kernel.cu")
     set_gencode_flags_for_srcs(
       SRCS "${SRCS}"

@@ -605,7 +649,7 @@
   # if it's possible to compile MoE kernels that use its output.
   cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a" "${CUDA_ARCHS}")
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
-    set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x_sm90.cu")
+    set(SRCS "csrc/quantization/w8a8/cutlass/moe/grouped_mm_c3x_sm90.cu")
     set_gencode_flags_for_srcs(
       SRCS "${SRCS}"
       CUDA_ARCHS "${SCALED_MM_ARCHS}")

@@ -623,9 +667,13 @@
     endif()
   endif()

-  cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
+    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}")
+  else()
+    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
+  endif()
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
-    set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x_sm100.cu")
+    set(SRCS "csrc/quantization/w8a8/cutlass/moe/grouped_mm_c3x_sm100.cu")
     set_gencode_flags_for_srcs(
       SRCS "${SRCS}"
       CUDA_ARCHS "${SCALED_MM_ARCHS}")

@@ -644,9 +692,13 @@
   endif()

   # moe_data.cu is used by all CUTLASS MoE kernels.
-  cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0a" "${CUDA_ARCHS}")
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
+    cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
+  else()
+    cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
+  endif()
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND CUTLASS_MOE_DATA_ARCHS)
-    set(SRCS "csrc/quantization/cutlass_w8a8/moe/moe_data.cu")
+    set(SRCS "csrc/quantization/w8a8/cutlass/moe/moe_data.cu")
     set_gencode_flags_for_srcs(
       SRCS "${SRCS}"
       CUDA_ARCHS "${CUTLASS_MOE_DATA_ARCHS}")

@@ -663,9 +715,13 @@
     endif()
   endif()

-  cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
+    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
+  else()
+    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
+  endif()
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
-    set(SRCS "csrc/quantization/cutlass_w8a8/moe/blockwise_scaled_group_mm_sm100.cu")
+    set(SRCS "csrc/quantization/w8a8/cutlass/moe/blockwise_scaled_group_mm_sm100.cu")
     set_gencode_flags_for_srcs(
       SRCS "${SRCS}"
       CUDA_ARCHS "${SCALED_MM_ARCHS}")

@@ -752,6 +808,44 @@
            "found in CUDA target architectures")
   endif()
 endif()
+
+# Only build W4A8 kernels if we are building for something compatible with sm90a
|
||||||
|
cuda_archs_loose_intersection(W4A8_ARCHS "9.0a" "${CUDA_ARCHS}")
|
||||||
|
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND W4A8_ARCHS)
|
||||||
|
set(SRCS
|
||||||
|
"csrc/quantization/cutlass_w4a8/w4a8_mm_entry.cu")
|
||||||
|
|
||||||
|
set_gencode_flags_for_srcs(
|
||||||
|
SRCS "${SRCS}"
|
||||||
|
CUDA_ARCHS "${W4A8_ARCHS}")
|
||||||
|
|
||||||
|
list(APPEND VLLM_EXT_SRC "${SRCS}")
|
||||||
|
|
||||||
|
message(STATUS "Building W4A8 kernels for archs: ${W4A8_ARCHS}")
|
||||||
|
else()
|
||||||
|
if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0
|
||||||
|
AND W4A8_ARCHS)
|
||||||
|
message(STATUS "Not building W4A8 kernels as CUDA Compiler version is "
|
||||||
|
"not >= 12.0, we recommend upgrading to CUDA 12.0 or "
|
||||||
|
"later if you intend on running w4a16 quantized models on "
|
||||||
|
"Hopper.")
|
||||||
|
else()
|
||||||
|
message(STATUS "Not building W4A8 kernels as no compatible archs "
|
||||||
|
"found in CUDA target architectures")
|
||||||
|
endif()
|
||||||
|
endif()
|
||||||
|
|
||||||
|
# Hadacore kernels
|
||||||
|
cuda_archs_loose_intersection(HADACORE_ARCHS "8.0;8.9;9.0" "${CUDA_ARCHS}")
|
||||||
|
if(HADACORE_ARCHS)
|
||||||
|
set(SRCS "csrc/quantization/hadamard/hadacore/hadamard_transform_cuda.cu")
|
||||||
|
set_gencode_flags_for_srcs(
|
||||||
|
SRCS "${SRCS}"
|
||||||
|
CUDA_ARCHS "${HADACORE_ARCHS}")
|
||||||
|
list(APPEND VLLM_EXT_SRC "${SRCS}")
|
||||||
|
message(STATUS "Building hadacore")
|
||||||
|
endif()
|
||||||
|
|
||||||
# if CUDA endif
|
# if CUDA endif
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
@ -792,7 +886,9 @@ set(VLLM_MOE_EXT_SRC
|
|||||||
"csrc/moe/topk_softmax_kernels.cu")
|
"csrc/moe/topk_softmax_kernels.cu")
|
||||||
|
|
||||||
if(VLLM_GPU_LANG STREQUAL "CUDA")
|
if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||||
list(APPEND VLLM_MOE_EXT_SRC "csrc/moe/moe_wna16.cu")
|
list(APPEND VLLM_MOE_EXT_SRC
|
||||||
|
"csrc/moe/moe_wna16.cu"
|
||||||
|
"csrc/moe/grouped_topk_kernels.cu")
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
if(VLLM_GPU_LANG STREQUAL "CUDA")
|
if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||||
|
|||||||
@ -2,7 +2,6 @@ include LICENSE
|
|||||||
include requirements/common.txt
|
include requirements/common.txt
|
||||||
include requirements/cuda.txt
|
include requirements/cuda.txt
|
||||||
include requirements/rocm.txt
|
include requirements/rocm.txt
|
||||||
include requirements/neuron.txt
|
|
||||||
include requirements/cpu.txt
|
include requirements/cpu.txt
|
||||||
include CMakeLists.txt
|
include CMakeLists.txt
|
||||||
|
|
||||||
|
|||||||
14
README.md
14
README.md
@ -14,18 +14,26 @@ Easy, fast, and cheap LLM serving for everyone
|
|||||||
| <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://blog.vllm.ai/"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://x.com/vllm_project"><b>Twitter/X</b></a> | <a href="https://discuss.vllm.ai"><b>User Forum</b></a> | <a href="https://slack.vllm.ai"><b>Developer Slack</b></a> |
|
| <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://blog.vllm.ai/"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://x.com/vllm_project"><b>Twitter/X</b></a> | <a href="https://discuss.vllm.ai"><b>User Forum</b></a> | <a href="https://slack.vllm.ai"><b>Developer Slack</b></a> |
|
||||||
</p>
|
</p>
|
||||||
|
|
||||||
|
---
|
||||||
|
Join us at the [PyTorch Conference, October 22-23](https://events.linuxfoundation.org/pytorch-conference/) and [Ray Summit, November 3-5](https://www.anyscale.com/ray-summit/2025) in San Francisco for our latest updates on vLLM and to meet the vLLM team! Register now for the largest vLLM community events of the year!
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
*Latest News* 🔥
|
*Latest News* 🔥
|
||||||
|
|
||||||
- [2025/08] We hosted [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/dgkWg1WFpWGO2jCdTqQHxA) focusing on large-scale LLM deployment! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Pid6NSFLU43DZRi0EaTcPgXsAzDvbBqF) and the recording [here](https://www.chaspark.com/#/live/1166916873711665152).
|
- [2025/09] We hosted [vLLM Toronto Meetup](https://luma.com/e80e0ymm) focused on tackling inference at scale and speculative decoding with speakers from NVIDIA and Red Hat! Please find the meetup slides [here](https://docs.google.com/presentation/d/1IYJYmJcu9fLpID5N5RbW_vO0XLo0CGOR14IXOjB61V8/edit?usp=sharing).
|
||||||
- [2025/05] We hosted [NYC vLLM Meetup](https://lu.ma/c1rqyf1f)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1_q_aW_ioMJWUImf1s1YM-ZhjXz8cUeL0IJvaquOYBeA/edit?usp=sharing).
|
- [2025/08] We hosted [vLLM Shenzhen Meetup](https://mp.weixin.qq.com/s/k8ZBO1u2_2odgiKWH_GVTQ) focusing on the ecosystem around vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Ua2SVKVSu-wp5vou_6ElraDt2bnKhiEA).
|
||||||
|
- [2025/08] We hosted [vLLM Singapore Meetup](https://www.sginnovate.com/event/vllm-sg-meet). We shared V1 updates, disaggregated serving and MLLM speedups with speakers from Embedded LLM, AMD, WekaIO, and A*STAR. Please find the meetup slides [here](https://drive.google.com/drive/folders/1ncf3GyqLdqFaB6IeB834E5TZJPLAOiXZ?usp=sharing).
|
||||||
|
- [2025/08] We hosted [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/pDmAXHcN7Iqc8sUKgJgGtg) focusing on building, developing, and integrating with vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1OvLx39wnCGy_WKq8SiVKf7YcxxYI3WCH).
|
||||||
- [2025/05] vLLM is now a hosted project under PyTorch Foundation! Please find the announcement [here](https://pytorch.org/blog/pytorch-foundation-welcomes-vllm/).
|
- [2025/05] vLLM is now a hosted project under PyTorch Foundation! Please find the announcement [here](https://pytorch.org/blog/pytorch-foundation-welcomes-vllm/).
|
||||||
- [2025/01] We are excited to announce the alpha release of vLLM V1: A major architectural upgrade with 1.7x speedup! Clean code, optimized execution loop, zero-overhead prefix caching, enhanced multimodal support, and more. Please check out our blog post [here](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html).
|
- [2025/01] We are excited to announce the alpha release of vLLM V1: A major architectural upgrade with 1.7x speedup! Clean code, optimized execution loop, zero-overhead prefix caching, enhanced multimodal support, and more. Please check out our blog post [here](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html).
|
||||||
|
|
||||||
<details>
|
<details>
|
||||||
<summary>Previous News</summary>
|
<summary>Previous News</summary>
|
||||||
|
|
||||||
|
- [2025/08] We hosted [vLLM Korea Meetup](https://luma.com/cgcgprmh) with Red Hat and Rebellions! We shared the latest advancements in vLLM along with project spotlights from the vLLM Korea community. Please find the meetup slides [here](https://drive.google.com/file/d/1bcrrAE1rxUgx0mjIeOWT6hNe2RefC5Hm/view).
|
||||||
|
- [2025/08] We hosted [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/dgkWg1WFpWGO2jCdTqQHxA) focusing on large-scale LLM deployment! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Pid6NSFLU43DZRi0EaTcPgXsAzDvbBqF) and the recording [here](https://www.chaspark.com/#/live/1166916873711665152).
|
||||||
|
- [2025/05] We hosted [NYC vLLM Meetup](https://lu.ma/c1rqyf1f)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1_q_aW_ioMJWUImf1s1YM-ZhjXz8cUeL0IJvaquOYBeA/edit?usp=sharing).
|
||||||
- [2025/04] We hosted [Asia Developer Day](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/19cp6Qu8u48ihB91A064XfaXruNYiBOUKrBxAmDOllOo/edit?usp=sharing).
|
- [2025/04] We hosted [Asia Developer Day](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/19cp6Qu8u48ihB91A064XfaXruNYiBOUKrBxAmDOllOo/edit?usp=sharing).
|
||||||
- [2025/03] We hosted [vLLM x Ollama Inference Night](https://lu.ma/vllm-ollama)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/16T2PDD1YwRnZ4Tu8Q5r6n53c5Lr5c73UV9Vd2_eBo4U/edit?usp=sharing).
|
- [2025/03] We hosted [vLLM x Ollama Inference Night](https://lu.ma/vllm-ollama)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/16T2PDD1YwRnZ4Tu8Q5r6n53c5Lr5c73UV9Vd2_eBo4U/edit?usp=sharing).
|
||||||
- [2025/03] We hosted [the first vLLM China Meetup](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg)! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1REHvfQMKGnvz6p3Fd23HhSO4c8j5WPGZV0bKYLwnHyQ/edit?usp=sharing).
|
- [2025/03] We hosted [the first vLLM China Meetup](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg)! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1REHvfQMKGnvz6p3Fd23HhSO4c8j5WPGZV0bKYLwnHyQ/edit?usp=sharing).
|
||||||
@ -74,7 +82,7 @@ vLLM is flexible and easy to use with:
|
|||||||
- Tensor, pipeline, data and expert parallelism support for distributed inference
|
- Tensor, pipeline, data and expert parallelism support for distributed inference
|
||||||
- Streaming outputs
|
- Streaming outputs
|
||||||
- OpenAI-compatible API server
|
- OpenAI-compatible API server
|
||||||
- Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, TPU, and AWS Neuron
|
- Support for NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, and TPU. Additionally, support for diverse hardware plugins such as Intel Gaudi, IBM Spyre and Huawei Ascend.
|
||||||
- Prefix caching support
|
- Prefix caching support
|
||||||
- Multi-LoRA support
|
- Multi-LoRA support
|
||||||
|
|
||||||
|
|||||||
@ -42,4 +42,9 @@ For certain security issues of CRITICAL, HIGH, or MODERATE severity level, we ma
|
|||||||
|
|
||||||
* If you wish to be added to the prenotification group, please send an email copying all the members of the [vulnerability management team](https://docs.vllm.ai/en/latest/contributing/vulnerability_management.html). Each vendor contact will be analyzed on a case-by-case basis.
|
* If you wish to be added to the prenotification group, please send an email copying all the members of the [vulnerability management team](https://docs.vllm.ai/en/latest/contributing/vulnerability_management.html). Each vendor contact will be analyzed on a case-by-case basis.
|
||||||
|
|
||||||
|
* Organizations and vendors who either ship or use vLLM, are eligible to join the prenotification group if they meet at least one of the following qualifications
|
||||||
|
* Substantial internal deployment leveraging the upstream vLLM project.
|
||||||
|
* Established internal security teams and comprehensive compliance measures.
|
||||||
|
* Active and consistent contributions to the upstream vLLM project.
|
||||||
|
|
||||||
* We may withdraw organizations from receiving future prenotifications if they release fixes or any other information about issues before they are public. Group membership may also change based on policy refinements for who may be included.
|
* We may withdraw organizations from receiving future prenotifications if they release fixes or any other information about issues before they are public. Group membership may also change based on policy refinements for who may be included.
|
||||||
|
|||||||
@ -1,687 +1,20 @@
|
|||||||
# Benchmarking vLLM
|
# Benchmarks
|
||||||
|
|
||||||
This README guides you through running benchmark tests with the extensive
|
This directory used to contain vLLM's benchmark scripts and utilities for performance testing and evaluation.
|
||||||
datasets supported on vLLM. It’s a living document, updated as new features and datasets
|
|
||||||
become available.
|
|
||||||
|
|
||||||
## Dataset Overview
|
## Contents
|
||||||
|
|
||||||
<table style="width:100%; border-collapse: collapse;">
|
- **Serving benchmarks**: Scripts for testing online inference performance (latency, throughput)
|
||||||
<thead>
|
- **Throughput benchmarks**: Scripts for testing offline batch inference performance
|
||||||
<tr>
|
- **Specialized benchmarks**: Tools for testing specific features like structured output, prefix caching, long document QA, request prioritization, and multi-modal inference
|
||||||
<th style="width:15%; text-align: left;">Dataset</th>
|
- **Dataset utilities**: Framework for loading and sampling from various benchmark datasets (ShareGPT, HuggingFace datasets, synthetic data, etc.)
|
||||||
<th style="width:10%; text-align: center;">Online</th>
|
|
||||||
<th style="width:10%; text-align: center;">Offline</th>
|
|
||||||
<th style="width:65%; text-align: left;">Data Path</th>
|
|
||||||
</tr>
|
|
||||||
</thead>
|
|
||||||
<tbody>
|
|
||||||
<tr>
|
|
||||||
<td><strong>ShareGPT</strong></td>
|
|
||||||
<td style="text-align: center;">✅</td>
|
|
||||||
<td style="text-align: center;">✅</td>
|
|
||||||
<td><code>wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json</code></td>
|
|
||||||
</tr>
|
|
||||||
<tr>
|
|
||||||
<td><strong>ShareGPT4V (Image)</strong></td>
|
|
||||||
<td style="text-align: center;">✅</td>
|
|
||||||
<td style="text-align: center;">✅</td>
|
|
||||||
<td>
|
|
||||||
<code>wget https://huggingface.co/datasets/Lin-Chen/ShareGPT4V/blob/main/sharegpt4v_instruct_gpt4-vision_cap100k.json</code>
|
|
||||||
<br>
|
|
||||||
<div>Note that the images need to be downloaded separately. For example, to download COCO's 2017 Train images:</div>
|
|
||||||
<code>wget http://images.cocodataset.org/zips/train2017.zip</code>
|
|
||||||
</td>
|
|
||||||
</tr>
|
|
||||||
<tr>
|
|
||||||
<td><strong>BurstGPT</strong></td>
|
|
||||||
<td style="text-align: center;">✅</td>
|
|
||||||
<td style="text-align: center;">✅</td>
|
|
||||||
<td><code>wget https://github.com/HPMLL/BurstGPT/releases/download/v1.1/BurstGPT_without_fails_2.csv</code></td>
|
|
||||||
</tr>
|
|
||||||
<tr>
|
|
||||||
<td><strong>Sonnet (deprecated)</strong></td>
|
|
||||||
<td style="text-align: center;">✅</td>
|
|
||||||
<td style="text-align: center;">✅</td>
|
|
||||||
<td>Local file: <code>benchmarks/sonnet.txt</code></td>
|
|
||||||
</tr>
|
|
||||||
<tr>
|
|
||||||
<td><strong>Random</strong></td>
|
|
||||||
<td style="text-align: center;">✅</td>
|
|
||||||
<td style="text-align: center;">✅</td>
|
|
||||||
<td><code>synthetic</code></td>
|
|
||||||
</tr>
|
|
||||||
<tr>
|
|
||||||
<td><strong>Prefix Repetition</strong></td>
|
|
||||||
<td style="text-align: center;">✅</td>
|
|
||||||
<td style="text-align: center;">✅</td>
|
|
||||||
<td><code>synthetic</code></td>
|
|
||||||
</tr>
|
|
||||||
<tr>
|
|
||||||
<td><strong>HuggingFace-VisionArena</strong></td>
|
|
||||||
<td style="text-align: center;">✅</td>
|
|
||||||
<td style="text-align: center;">✅</td>
|
|
||||||
<td><code>lmarena-ai/VisionArena-Chat</code></td>
|
|
||||||
</tr>
|
|
||||||
<tr>
|
|
||||||
<td><strong>HuggingFace-InstructCoder</strong></td>
|
|
||||||
<td style="text-align: center;">✅</td>
|
|
||||||
<td style="text-align: center;">✅</td>
|
|
||||||
<td><code>likaixin/InstructCoder</code></td>
|
|
||||||
</tr>
|
|
||||||
<tr>
|
|
||||||
<td><strong>HuggingFace-AIMO</strong></td>
|
|
||||||
<td style="text-align: center;">✅</td>
|
|
||||||
<td style="text-align: center;">✅</td>
|
|
||||||
<td><code>AI-MO/aimo-validation-aime</code> , <code>AI-MO/NuminaMath-1.5</code>, <code>AI-MO/NuminaMath-CoT</code></td>
|
|
||||||
</tr>
|
|
||||||
<tr>
|
|
||||||
<td><strong>HuggingFace-Other</strong></td>
|
|
||||||
<td style="text-align: center;">✅</td>
|
|
||||||
<td style="text-align: center;">✅</td>
|
|
||||||
<td><code>lmms-lab/LLaVA-OneVision-Data</code>, <code>Aeala/ShareGPT_Vicuna_unfiltered</code></td>
|
|
||||||
</tr>
|
|
||||||
<tr>
|
|
||||||
<td><strong>Custom</strong></td>
|
|
||||||
<td style="text-align: center;">✅</td>
|
|
||||||
<td style="text-align: center;">✅</td>
|
|
||||||
<td>Local file: <code>data.jsonl</code></td>
|
|
||||||
</tr>
|
|
||||||
</tbody>
|
|
||||||
</table>
|
|
||||||
|
|
||||||
✅: supported
|
## Usage
|
||||||
|
|
||||||
🟡: Partial support
|
For detailed usage instructions, examples, and dataset information, see the [Benchmark CLI documentation](https://docs.vllm.ai/en/latest/contributing/benchmarks.html#benchmark-cli).
|
||||||
|
|
||||||
🚧: to be supported
|
For full CLI reference see:
|
||||||
|
|
||||||
**Note**: HuggingFace dataset's `dataset-name` should be set to `hf`
|
- <https://docs.vllm.ai/en/latest/cli/bench/latency.html>
|
||||||
|
- <https://docs.vllm.ai/en/latest/cli/bench/serve.html>
|
||||||
## 🚀 Example - Online Benchmark
|
- <https://docs.vllm.ai/en/latest/cli/bench/throughput.html>
|
||||||
|
|
||||||
<details>
|
|
||||||
<summary>Show more</summary>
|
|
||||||
|
|
||||||
<br/>
|
|
||||||
|
|
||||||
First start serving your model
|
|
||||||
|
|
||||||
```bash
|
|
||||||
vllm serve NousResearch/Hermes-3-Llama-3.1-8B
|
|
||||||
```
|
|
||||||
|
|
||||||
Then run the benchmarking script
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# download dataset
|
|
||||||
# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
|
|
||||||
vllm bench serve \
|
|
||||||
--backend vllm \
|
|
||||||
--model NousResearch/Hermes-3-Llama-3.1-8B \
|
|
||||||
--endpoint /v1/completions \
|
|
||||||
--dataset-name sharegpt \
|
|
||||||
--dataset-path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
|
|
||||||
--num-prompts 10
|
|
||||||
```
|
|
||||||
|
|
||||||
If successful, you will see the following output
|
|
||||||
|
|
||||||
```text
|
|
||||||
============ Serving Benchmark Result ============
|
|
||||||
Successful requests: 10
|
|
||||||
Benchmark duration (s): 5.78
|
|
||||||
Total input tokens: 1369
|
|
||||||
Total generated tokens: 2212
|
|
||||||
Request throughput (req/s): 1.73
|
|
||||||
Output token throughput (tok/s): 382.89
|
|
||||||
Total Token throughput (tok/s): 619.85
|
|
||||||
---------------Time to First Token----------------
|
|
||||||
Mean TTFT (ms): 71.54
|
|
||||||
Median TTFT (ms): 73.88
|
|
||||||
P99 TTFT (ms): 79.49
|
|
||||||
-----Time per Output Token (excl. 1st token)------
|
|
||||||
Mean TPOT (ms): 7.91
|
|
||||||
Median TPOT (ms): 7.96
|
|
||||||
P99 TPOT (ms): 8.03
|
|
||||||
---------------Inter-token Latency----------------
|
|
||||||
Mean ITL (ms): 7.74
|
|
||||||
Median ITL (ms): 7.70
|
|
||||||
P99 ITL (ms): 8.39
|
|
||||||
==================================================
|
|
||||||
```
|
|
||||||
|
|
||||||
### Custom Dataset
|
|
||||||
|
|
||||||
If the dataset you want to benchmark is not supported yet in vLLM, even then you can benchmark on it using `CustomDataset`. Your data needs to be in `.jsonl` format and needs to have "prompt" field per entry, e.g., data.jsonl
|
|
||||||
|
|
||||||
```json
|
|
||||||
{"prompt": "What is the capital of India?"}
|
|
||||||
{"prompt": "What is the capital of Iran?"}
|
|
||||||
{"prompt": "What is the capital of China?"}
|
|
||||||
```
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# start server
|
|
||||||
VLLM_USE_V1=1 vllm serve meta-llama/Llama-3.1-8B-Instruct
|
|
||||||
```
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# run benchmarking script
|
|
||||||
vllm bench serve --port 9001 --save-result --save-detailed \
|
|
||||||
--backend vllm \
|
|
||||||
--model meta-llama/Llama-3.1-8B-Instruct \
|
|
||||||
--endpoint /v1/completions \
|
|
||||||
--dataset-name custom \
|
|
||||||
--dataset-path <path-to-your-data-jsonl> \
|
|
||||||
--custom-skip-chat-template \
|
|
||||||
--num-prompts 80 \
|
|
||||||
--max-concurrency 1 \
|
|
||||||
--temperature=0.3 \
|
|
||||||
--top-p=0.75 \
|
|
||||||
--result-dir "./log/"
|
|
||||||
```
|
|
||||||
|
|
||||||
You can skip applying chat template if your data already has it by using `--custom-skip-chat-template`.
|
|
||||||
|
|
||||||
### VisionArena Benchmark for Vision Language Models
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# need a model with vision capability here
|
|
||||||
vllm serve Qwen/Qwen2-VL-7B-Instruct
|
|
||||||
```
|
|
||||||
|
|
||||||
```bash
|
|
||||||
vllm bench serve \
|
|
||||||
--backend openai-chat \
|
|
||||||
--model Qwen/Qwen2-VL-7B-Instruct \
|
|
||||||
--endpoint /v1/chat/completions \
|
|
||||||
--dataset-name hf \
|
|
||||||
--dataset-path lmarena-ai/VisionArena-Chat \
|
|
||||||
--hf-split train \
|
|
||||||
--num-prompts 1000
|
|
||||||
```
|
|
||||||
|
|
||||||
### InstructCoder Benchmark with Speculative Decoding
|
|
||||||
|
|
||||||
``` bash
|
|
||||||
VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \
|
|
||||||
--speculative-config $'{"method": "ngram",
|
|
||||||
"num_speculative_tokens": 5, "prompt_lookup_max": 5,
|
|
||||||
"prompt_lookup_min": 2}'
|
|
||||||
```
|
|
||||||
|
|
||||||
``` bash
|
|
||||||
vllm bench serve \
|
|
||||||
--model meta-llama/Meta-Llama-3-8B-Instruct \
|
|
||||||
--dataset-name hf \
|
|
||||||
--dataset-path likaixin/InstructCoder \
|
|
||||||
--num-prompts 2048
|
|
||||||
```
|
|
||||||
|
|
||||||
### Other HuggingFaceDataset Examples
|
|
||||||
|
|
||||||
```bash
|
|
||||||
vllm serve Qwen/Qwen2-VL-7B-Instruct
|
|
||||||
```
|
|
||||||
|
|
||||||
`lmms-lab/LLaVA-OneVision-Data`:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
vllm bench serve \
|
|
||||||
--backend openai-chat \
|
|
||||||
--model Qwen/Qwen2-VL-7B-Instruct \
|
|
||||||
--endpoint /v1/chat/completions \
|
|
||||||
--dataset-name hf \
|
|
||||||
--dataset-path lmms-lab/LLaVA-OneVision-Data \
|
|
||||||
--hf-split train \
|
|
||||||
--hf-subset "chart2text(cauldron)" \
|
|
||||||
--num-prompts 10
|
|
||||||
```
|
|
||||||
|
|
||||||
`Aeala/ShareGPT_Vicuna_unfiltered`:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
vllm bench serve \
|
|
||||||
--backend openai-chat \
|
|
||||||
--model Qwen/Qwen2-VL-7B-Instruct \
|
|
||||||
--endpoint /v1/chat/completions \
|
|
||||||
--dataset-name hf \
|
|
||||||
--dataset-path Aeala/ShareGPT_Vicuna_unfiltered \
|
|
||||||
--hf-split train \
|
|
||||||
--num-prompts 10
|
|
||||||
```
|
|
||||||
|
|
||||||
`AI-MO/aimo-validation-aime`:
|
|
||||||
|
|
||||||
``` bash
|
|
||||||
vllm bench serve \
|
|
||||||
--model Qwen/QwQ-32B \
|
|
||||||
--dataset-name hf \
|
|
||||||
--dataset-path AI-MO/aimo-validation-aime \
|
|
||||||
--num-prompts 10 \
|
|
||||||
--seed 42
|
|
||||||
```
|
|
||||||
|
|
||||||
`philschmid/mt-bench`:
|
|
||||||
|
|
||||||
``` bash
|
|
||||||
vllm bench serve \
|
|
||||||
--model Qwen/QwQ-32B \
|
|
||||||
--dataset-name hf \
|
|
||||||
--dataset-path philschmid/mt-bench \
|
|
||||||
--num-prompts 80
|
|
||||||
```
|
|
||||||
|
|
||||||
### Running With Sampling Parameters
|
|
||||||
|
|
||||||
When using OpenAI-compatible backends such as `vllm`, optional sampling
|
|
||||||
parameters can be specified. Example client command:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
vllm bench serve \
|
|
||||||
--backend vllm \
|
|
||||||
--model NousResearch/Hermes-3-Llama-3.1-8B \
|
|
||||||
--endpoint /v1/completions \
|
|
||||||
--dataset-name sharegpt \
|
|
||||||
--dataset-path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
|
|
||||||
--top-k 10 \
|
|
||||||
--top-p 0.9 \
|
|
||||||
--temperature 0.5 \
|
|
||||||
--num-prompts 10
|
|
||||||
```
|
|
||||||
|
|
||||||
### Running With Ramp-Up Request Rate
|
|
||||||
|
|
||||||
The benchmark tool also supports ramping up the request rate over the
|
|
||||||
duration of the benchmark run. This can be useful for stress testing the
|
|
||||||
server or finding the maximum throughput that it can handle, given some latency budget.
|
|
||||||
|
|
||||||
Two ramp-up strategies are supported:
|
|
||||||
|
|
||||||
- `linear`: Increases the request rate linearly from a start value to an end value.
|
|
||||||
- `exponential`: Increases the request rate exponentially.
|
|
||||||
|
|
||||||
The following arguments can be used to control the ramp-up:
|
|
||||||
|
|
||||||
- `--ramp-up-strategy`: The ramp-up strategy to use (`linear` or `exponential`).
|
|
||||||
- `--ramp-up-start-rps`: The request rate at the beginning of the benchmark.
|
|
||||||
- `--ramp-up-end-rps`: The request rate at the end of the benchmark.
|
|
||||||
|
|
||||||
</details>
|
|
||||||
|
|
||||||
## 📈 Example - Offline Throughput Benchmark
|
|
||||||
|
|
||||||
<details>
|
|
||||||
<summary>Show more</summary>
|
|
||||||
|
|
||||||
<br/>
|
|
||||||
|
|
||||||
```bash
|
|
||||||
vllm bench throughput \
|
|
||||||
--model NousResearch/Hermes-3-Llama-3.1-8B \
|
|
||||||
--dataset-name sonnet \
|
|
||||||
--dataset-path vllm/benchmarks/sonnet.txt \
|
|
||||||
--num-prompts 10
|
|
||||||
```
|
|
||||||
|
|
||||||
If successful, you will see the following output
|
|
||||||
|
|
||||||
```text
|
|
||||||
Throughput: 7.15 requests/s, 4656.00 total tokens/s, 1072.15 output tokens/s
|
|
||||||
Total num prompt tokens: 5014
|
|
||||||
Total num output tokens: 1500
|
|
||||||
```
|
|
||||||
|
|
||||||
### VisionArena Benchmark for Vision Language Models
|
|
||||||
|
|
||||||
```bash
|
|
||||||
vllm bench throughput \
|
|
||||||
--model Qwen/Qwen2-VL-7B-Instruct \
|
|
||||||
--backend vllm-chat \
|
|
||||||
--dataset-name hf \
|
|
||||||
--dataset-path lmarena-ai/VisionArena-Chat \
|
|
||||||
--num-prompts 1000 \
|
|
||||||
--hf-split train
|
|
||||||
```
|
|
||||||
|
|
||||||
The `num prompt tokens` now includes image token counts
|
|
||||||
|
|
||||||
```text
|
|
||||||
Throughput: 2.55 requests/s, 4036.92 total tokens/s, 326.90 output tokens/s
|
|
||||||
Total num prompt tokens: 14527
|
|
||||||
Total num output tokens: 1280
|
|
||||||
```
|
|
||||||
|
|
||||||
### InstructCoder Benchmark with Speculative Decoding
|
|
||||||
|
|
||||||
``` bash
|
|
||||||
VLLM_WORKER_MULTIPROC_METHOD=spawn \
|
|
||||||
VLLM_USE_V1=1 \
|
|
||||||
vllm bench throughput \
|
|
||||||
--dataset-name=hf \
|
|
||||||
--dataset-path=likaixin/InstructCoder \
|
|
||||||
--model=meta-llama/Meta-Llama-3-8B-Instruct \
|
|
||||||
--input-len=1000 \
|
|
||||||
--output-len=100 \
|
|
||||||
--num-prompts=2048 \
|
|
||||||
--async-engine \
|
|
||||||
--speculative-config $'{"method": "ngram",
|
|
||||||
"num_speculative_tokens": 5, "prompt_lookup_max": 5,
|
|
||||||
"prompt_lookup_min": 2}'
|
|
||||||
```
|
|
||||||
|
|
||||||
```text
|
|
||||||
Throughput: 104.77 requests/s, 23836.22 total tokens/s, 10477.10 output tokens/s
|
|
||||||
Total num prompt tokens: 261136
|
|
||||||
Total num output tokens: 204800
|
|
||||||
```
|
|
||||||
|
|
||||||
### Other HuggingFaceDataset Examples
|
|
||||||
|
|
||||||
`lmms-lab/LLaVA-OneVision-Data`:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
vllm bench throughput \
|
|
||||||
--model Qwen/Qwen2-VL-7B-Instruct \
|
|
||||||
--backend vllm-chat \
|
|
||||||
--dataset-name hf \
|
|
||||||
--dataset-path lmms-lab/LLaVA-OneVision-Data \
|
|
||||||
--hf-split train \
|
|
||||||
--hf-subset "chart2text(cauldron)" \
|
|
||||||
--num-prompts 10
|
|
||||||
```
|
|
||||||
|
|
||||||
`Aeala/ShareGPT_Vicuna_unfiltered`:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
vllm bench throughput \
|
|
||||||
--model Qwen/Qwen2-VL-7B-Instruct \
|
|
||||||
--backend vllm-chat \
|
|
||||||
--dataset-name hf \
|
|
||||||
--dataset-path Aeala/ShareGPT_Vicuna_unfiltered \
|
|
||||||
--hf-split train \
|
|
||||||
--num-prompts 10
|
|
||||||
```
|
|
||||||
|
|
||||||
`AI-MO/aimo-validation-aime`:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
vllm bench throughput \
|
|
||||||
--model Qwen/QwQ-32B \
|
|
||||||
--backend vllm \
|
|
||||||
--dataset-name hf \
|
|
||||||
--dataset-path AI-MO/aimo-validation-aime \
|
|
||||||
--hf-split train \
|
|
||||||
--num-prompts 10
|
|
||||||
```
|
|
||||||
|
|
||||||
Benchmark with LoRA adapters:
|
|
||||||
|
|
||||||
``` bash
|
|
||||||
# download dataset
|
|
||||||
# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
|
|
||||||
vllm bench throughput \
|
|
||||||
--model meta-llama/Llama-2-7b-hf \
|
|
||||||
--backend vllm \
|
|
||||||
--dataset_path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
|
|
||||||
--dataset_name sharegpt \
|
|
||||||
--num-prompts 10 \
|
|
||||||
--max-loras 2 \
|
|
||||||
--max-lora-rank 8 \
|
|
||||||
--enable-lora \
|
|
||||||
--lora-path yard1/llama-2-7b-sql-lora-test
|
|
||||||
```
|
|
||||||
|
|
||||||
</details>
|
|
||||||
|
|
||||||
## 🛠️ Example - Structured Output Benchmark
|
|
||||||
|
|
||||||
<details>
|
|
||||||
<summary>Show more</summary>
|
|
||||||
|
|
||||||
<br/>
|
|
||||||
|
|
||||||
Benchmark the performance of structured output generation (JSON, grammar, regex).
|
|
||||||
|
|
||||||
### Server Setup
|
|
||||||
|
|
||||||
```bash
|
|
||||||
vllm serve NousResearch/Hermes-3-Llama-3.1-8B
|
|
||||||
```
|
|
||||||
|
|
||||||
### JSON Schema Benchmark
|
|
||||||
|
|
||||||
```bash
|
|
||||||
python3 benchmarks/benchmark_serving_structured_output.py \
|
|
||||||
--backend vllm \
|
|
||||||
--model NousResearch/Hermes-3-Llama-3.1-8B \
|
|
||||||
--dataset json \
|
|
||||||
--structured-output-ratio 1.0 \
|
|
||||||
--request-rate 10 \
|
|
||||||
--num-prompts 1000
|
|
||||||
```
|
|
||||||
|
|
||||||
### Grammar-based Generation Benchmark
|
|
||||||
|
|
||||||
```bash
|
|
||||||
python3 benchmarks/benchmark_serving_structured_output.py \
|
|
||||||
--backend vllm \
|
|
||||||
--model NousResearch/Hermes-3-Llama-3.1-8B \
|
|
||||||
--dataset grammar \
|
|
||||||
--structure-type grammar \
|
|
||||||
--request-rate 10 \
|
|
||||||
--num-prompts 1000
|
|
||||||
```
|
|
||||||
|
|
||||||
### Regex-based Generation Benchmark
|
|
||||||
|
|
||||||
```bash
|
|
||||||
python3 benchmarks/benchmark_serving_structured_output.py \
|
|
||||||
--backend vllm \
|
|
||||||
--model NousResearch/Hermes-3-Llama-3.1-8B \
|
|
||||||
--dataset regex \
|
|
||||||
--request-rate 10 \
|
|
||||||
--num-prompts 1000
|
|
||||||
```
|
|
||||||
|
|
||||||
### Choice-based Generation Benchmark
|
|
||||||
|
|
||||||
```bash
|
|
||||||
python3 benchmarks/benchmark_serving_structured_output.py \
|
|
||||||
--backend vllm \
|
|
||||||
--model NousResearch/Hermes-3-Llama-3.1-8B \
|
|
||||||
--dataset choice \
|
|
||||||
--request-rate 10 \
|
|
||||||
--num-prompts 1000
|
|
||||||
```
|
|
||||||
|
|
||||||
### XGrammar Benchmark Dataset
|
|
||||||
|
|
||||||
```bash
|
|
||||||
python3 benchmarks/benchmark_serving_structured_output.py \
|
|
||||||
--backend vllm \
|
|
||||||
--model NousResearch/Hermes-3-Llama-3.1-8B \
|
|
||||||
--dataset xgrammar_bench \
|
|
||||||
--request-rate 10 \
|
|
||||||
--num-prompts 1000
|
|
||||||
```
|
|
||||||
|
|
||||||
</details>
|
|
||||||
|
|
||||||
## 📚 Example - Long Document QA Benchmark
|
|
||||||
|
|
||||||
<details>
|
|
||||||
<summary>Show more</summary>
|
|
||||||
|
|
||||||
<br/>
|
|
||||||
|
|
||||||
Benchmark the performance of long document question-answering with prefix caching.
|
|
||||||
|
|
||||||
### Basic Long Document QA Test
|
|
||||||
|
|
||||||
```bash
|
|
||||||
python3 benchmarks/benchmark_long_document_qa_throughput.py \
|
|
||||||
--model meta-llama/Llama-2-7b-chat-hf \
|
|
||||||
--enable-prefix-caching \
|
|
||||||
--num-documents 16 \
|
|
||||||
--document-length 2000 \
|
|
||||||
--output-len 50 \
|
|
||||||
--repeat-count 5
|
|
||||||
```
|
|
||||||
|
|
||||||
### Different Repeat Modes
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# Random mode (default) - shuffle prompts randomly
|
|
||||||
python3 benchmarks/benchmark_long_document_qa_throughput.py \
|
|
||||||
--model meta-llama/Llama-2-7b-chat-hf \
|
|
||||||
--enable-prefix-caching \
|
|
||||||
--num-documents 8 \
|
|
||||||
--document-length 3000 \
|
|
||||||
--repeat-count 3 \
|
|
||||||
--repeat-mode random
|
|
||||||
|
|
||||||
# Tile mode - repeat entire prompt list in sequence
|
|
||||||
python3 benchmarks/benchmark_long_document_qa_throughput.py \
|
|
||||||
--model meta-llama/Llama-2-7b-chat-hf \
|
|
||||||
--enable-prefix-caching \
|
|
||||||
--num-documents 8 \
|
|
||||||
--document-length 3000 \
|
|
||||||
--repeat-count 3 \
|
|
||||||
--repeat-mode tile
|
|
||||||
|
|
||||||
# Interleave mode - repeat each prompt consecutively
|
|
||||||
python3 benchmarks/benchmark_long_document_qa_throughput.py \
|
|
||||||
--model meta-llama/Llama-2-7b-chat-hf \
|
|
||||||
--enable-prefix-caching \
|
|
||||||
--num-documents 8 \
|
|
||||||
--document-length 3000 \
|
|
||||||
--repeat-count 3 \
|
|
||||||
--repeat-mode interleave
|
|
||||||
```
|
|
||||||
|
|
||||||
</details>
|
|
||||||
|
|
||||||
## 🗂️ Example - Prefix Caching Benchmark
|
|
||||||
|
|
||||||
<details>
|
|
||||||
<summary>Show more</summary>
|
|
||||||
|
|
||||||
<br/>
|
|
||||||
|
|
||||||
Benchmark the efficiency of automatic prefix caching.
|
|
||||||
|
|
||||||
### Fixed Prompt with Prefix Caching
|
|
||||||
|
|
||||||
```bash
|
|
||||||
python3 benchmarks/benchmark_prefix_caching.py \
|
|
||||||
--model meta-llama/Llama-2-7b-chat-hf \
|
|
||||||
--enable-prefix-caching \
|
|
||||||
--num-prompts 1 \
|
|
||||||
--repeat-count 100 \
|
|
||||||
--input-length-range 128:256
|
|
||||||
```
|
|
||||||
|
|
||||||
### ShareGPT Dataset with Prefix Caching
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# download dataset
|
|
||||||
# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
|
|
||||||
|
|
||||||
python3 benchmarks/benchmark_prefix_caching.py \
|
|
||||||
--model meta-llama/Llama-2-7b-chat-hf \
|
|
||||||
--dataset-path /path/ShareGPT_V3_unfiltered_cleaned_split.json \
|
|
||||||
--enable-prefix-caching \
|
|
||||||
--num-prompts 20 \
|
|
||||||
--repeat-count 5 \
|
|
||||||
--input-length-range 128:256
|
|
||||||
```
|
|
||||||
|
|
||||||
### Prefix Repetition Dataset
|
|
||||||
|
|
||||||
```bash
|
|
||||||
vllm bench serve \
|
|
||||||
--backend openai \
|
|
||||||
--model meta-llama/Llama-2-7b-chat-hf \
|
|
||||||
--dataset-name prefix_repetition \
|
|
||||||
--num-prompts 100 \
|
|
||||||
--prefix-repetition-prefix-len 512 \
|
|
||||||
--prefix-repetition-suffix-len 128 \
|
|
||||||
--prefix-repetition-num-prefixes 5 \
|
|
||||||
--prefix-repetition-output-len 128
|
|
||||||
```
|
|
||||||
|
|
||||||
</details>
|
|
||||||
|
|
||||||
## ⚡ Example - Request Prioritization Benchmark
|
|
||||||
|
|
||||||
<details>
|
|
||||||
<summary>Show more</summary>
|
|
||||||
|
|
||||||
<br/>
|
|
||||||
|
|
||||||
Benchmark the performance of request prioritization in vLLM.
|
|
||||||
|
|
||||||
### Basic Prioritization Test
|
|
||||||
|
|
||||||
```bash
|
|
||||||
python3 benchmarks/benchmark_prioritization.py \
|
|
||||||
--model meta-llama/Llama-2-7b-chat-hf \
|
|
||||||
--input-len 128 \
|
|
||||||
--output-len 64 \
|
|
||||||
--num-prompts 100 \
|
|
||||||
--scheduling-policy priority
|
|
||||||
```
|
|
||||||
|
|
||||||
### Multiple Sequences per Prompt
|
|
||||||
|
|
||||||
```bash
|
|
||||||
python3 benchmarks/benchmark_prioritization.py \
|
|
||||||
--model meta-llama/Llama-2-7b-chat-hf \
|
|
||||||
--input-len 128 \
|
|
||||||
--output-len 64 \
|
|
||||||
--num-prompts 100 \
|
|
||||||
--scheduling-policy priority \
|
|
||||||
--n 2
|
|
||||||
```
|
|
||||||
|
|
||||||
</details>
|
|
||||||
|
|
||||||
## 👁️ Example - Multi-Modal Benchmark
|
|
||||||
|
|
||||||
<details>
|
|
||||||
<summary>Show more</summary>
|
|
||||||
|
|
||||||
<br/>
|
|
||||||
|
|
||||||
Benchmark the performance of multi-modal requests in vLLM.
|
|
||||||
|
|
||||||
### Images (ShareGPT4V)
|
|
||||||
|
|
||||||
Start vLLM:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
python -m vllm.entrypoints.openai.api_server \
|
|
||||||
--model Qwen/Qwen2.5-VL-7B-Instruct \
|
|
||||||
--dtype bfloat16 \
|
|
||||||
--limit-mm-per-prompt '{"image": 1}' \
|
|
||||||
--allowed-local-media-path /path/to/sharegpt4v/images
|
|
||||||
```
|
|
||||||
|
|
||||||
Send requests with images:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
python benchmarks/benchmark_serving.py \
|
|
||||||
--backend openai-chat \
|
|
||||||
--model Qwen/Qwen2.5-VL-7B-Instruct \
|
|
||||||
--dataset-name sharegpt \
|
|
||||||
--dataset-path /path/to/ShareGPT4V/sharegpt4v_instruct_gpt4-vision_cap100k.json \
|
|
||||||
--num-prompts 100 \
|
|
||||||
--save-result \
|
|
||||||
--result-dir ~/vllm_benchmark_results \
|
|
||||||
--save-detailed \
|
|
||||||
--endpoint /v1/chat/completion
|
|
||||||
```
|
|
||||||
|
|
||||||
</details>
|
|
||||||
|
|||||||
@ -31,6 +31,12 @@ cd vllm
|
|||||||
|
|
||||||
You must set the following variables at the top of the script before execution.
|
You must set the following variables at the top of the script before execution.
|
||||||
|
|
||||||
|
Note: You can also override the default values below via environment variables when running the script.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
MODEL=meta-llama/Llama-3.3-70B-Instruct SYSTEM=TPU TP=8 DOWNLOAD_DIR='' INPUT_LEN=128 OUTPUT_LEN=2048 MAX_MODEL_LEN=2300 MIN_CACHE_HIT_PCT=0 MAX_LATENCY_ALLOWED_MS=100000000000 NUM_SEQS_LIST="128 256" NUM_BATCHED_TOKENS_LIST="1024 2048 4096" VLLM_LOGGING_LEVEL=DEBUG bash auto_tune.sh
|
||||||
|
```
|
||||||
|
|
||||||
| Variable | Description | Example Value |
|
| Variable | Description | Example Value |
|
||||||
| --- | --- | --- |
|
| --- | --- | --- |
|
||||||
| `BASE` | **Required.** The absolute path to the parent directory of your vLLM repository directory. | `"$HOME"` |
|
| `BASE` | **Required.** The absolute path to the parent directory of your vLLM repository directory. | `"$HOME"` |
|
||||||
@ -143,3 +149,70 @@ The script follows a systematic process to find the optimal parameters:
|
|||||||
4. **Track Best Result**: Throughout the process, the script tracks the parameter combination that has yielded the highest valid throughput so far.
|
4. **Track Best Result**: Throughout the process, the script tracks the parameter combination that has yielded the highest valid throughput so far.
|
||||||
|
|
||||||
5. **Profile Collection**: For the best-performing run, the script saves the vLLM profiler output, which can be used for deep-dive performance analysis with tools like TensorBoard.
|
5. **Profile Collection**: For the best-performing run, the script saves the vLLM profiler output, which can be used for deep-dive performance analysis with tools like TensorBoard.
|
||||||
|
|
||||||
|
## Batched `auto_tune`
|
||||||
|
|
||||||
|
The `batch_auto_tune.sh` script allows you to run multiple `auto_tune.sh` experiments sequentially from a single configuration file. It iterates through a list of parameter sets, executes `auto_tune.sh` for each, and records the results back into the input file.
|
||||||
|
|
||||||
|
### Prerequisites
|
||||||
|
|
||||||
|
- **jq**: This script requires `jq` to parse the JSON configuration file.
|
||||||
|
- **gcloud**: If you plan to upload results to Google Cloud Storage, the `gcloud` CLI must be installed and authenticated.
|
||||||
|
|
||||||
|
### How to Run
|
||||||
|
|
||||||
|
1. **Create a JSON configuration file**: Create a file (e.g., `runs_config.json`) containing an array of JSON objects. Each object defines the parameters for a single `auto_tune.sh` run.
|
||||||
|
|
||||||
|
2. **Execute the script**:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
bash batch_auto_tune.sh <path_to_json_file> [gcs_upload_path]
|
||||||
|
```
|
||||||
|
|
||||||
|
- `<path_to_json_file>`: **Required.** Path to your JSON configuration file.
|
||||||
|
- `[gcs_upload_path]`: **Optional.** A GCS path (e.g., `gs://my-bucket/benchmark-results`) where the detailed results and profiles for each run will be uploaded. If this is empty, the results will be available on the local filesystem (see the log for `RESULT_FILE=/path/to/results/file.txt`).
|
||||||
|
|
||||||
|
### Configuration File
|
||||||
|
|
||||||
|
The JSON configuration file should contain an array of objects. Each object's keys correspond to the configuration variables for `auto_tune.sh` (see the [Configuration table above](#configuration)). These keys will be converted to uppercase environment variables for each run.
|
||||||
|
|
||||||
|
Here is an example `runs_config.json` with two benchmark configurations:
|
||||||
|
|
||||||
|
```json
|
||||||
|
[
|
||||||
|
{
|
||||||
|
"base": "/home/user",
|
||||||
|
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||||
|
"system": "TPU", # OR GPU
|
||||||
|
"tp": 8,
|
||||||
|
"input_len": 128,
|
||||||
|
"output_len": 2048,
|
||||||
|
"max_model_len": 2300,
|
||||||
|
"num_seqs_list": "128 256",
|
||||||
|
"num_batched_tokens_list": "8192 16384"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"base": "/home/user",
|
||||||
|
"model": "meta-llama/Llama-3.1-70B-Instruct",
|
||||||
|
"system": "TPU", # OR GPU
|
||||||
|
"tp": 8,
|
||||||
|
"input_len": 4000,
|
||||||
|
"output_len": 16,
|
||||||
|
"max_model_len": 4096,
|
||||||
|
"num_seqs_list": "64 128",
|
||||||
|
"num_batched_tokens_list": "4096 8192",
|
||||||
|
"max_latency_allowed_ms": 500
|
||||||
|
}
|
||||||
|
]
|
||||||
|
```
|
||||||
|
|
||||||
|
### Output
|
||||||
|
|
||||||
|
The script modifies the input JSON file in place, adding the results of each run to the corresponding object. The following fields are added:
|
||||||
|
|
||||||
|
- `run_id`: A unique identifier for the run, derived from the timestamp.
|
||||||
|
- `status`: The outcome of the run (`SUCCESS`, `FAILURE`, or `WARNING_NO_RESULT_FILE`).
|
||||||
|
- `results`: The content of the `result.txt` file from the `auto_tune.sh` run.
|
||||||
|
- `gcs_results`: The GCS URL where the run's artifacts are stored (if a GCS path was provided).
|
||||||
|
|
||||||
|
A summary of successful and failed runs is also printed to the console upon completion.
|
||||||
|
|||||||
@ -5,25 +5,41 @@
|
|||||||
|
|
||||||
TAG=$(date +"%Y_%m_%d_%H_%M")
|
TAG=$(date +"%Y_%m_%d_%H_%M")
|
||||||
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
|
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
|
||||||
BASE="$SCRIPT_DIR/../../.."
|
VLLM_LOGGING_LEVEL=${VLLM_LOGGING_LEVEL:-INFO}
|
||||||
MODEL="meta-llama/Llama-3.1-8B-Instruct"
|
BASE=${BASE:-"$SCRIPT_DIR/../../.."}
|
||||||
SYSTEM="TPU"
|
MODEL=${MODEL:-"meta-llama/Llama-3.1-8B-Instruct"}
|
||||||
TP=1
|
SYSTEM=${SYSTEM:-"TPU"}
|
||||||
DOWNLOAD_DIR=""
|
TP=${TP:-1}
|
||||||
INPUT_LEN=4000
|
DOWNLOAD_DIR=${DOWNLOAD_DIR:-""}
|
||||||
OUTPUT_LEN=16
|
INPUT_LEN=${INPUT_LEN:-4000}
|
||||||
MAX_MODEL_LEN=4096
|
OUTPUT_LEN=${OUTPUT_LEN:-16}
|
||||||
MIN_CACHE_HIT_PCT=0
|
MAX_MODEL_LEN=${MAX_MODEL_LEN:-4096}
|
||||||
MAX_LATENCY_ALLOWED_MS=100000000000
|
MIN_CACHE_HIT_PCT=${MIN_CACHE_HIT_PCT:-0}
|
||||||
NUM_SEQS_LIST="128 256"
|
MAX_LATENCY_ALLOWED_MS=${MAX_LATENCY_ALLOWED_MS:-100000000000}
|
||||||
NUM_BATCHED_TOKENS_LIST="512 1024 2048 4096"
|
NUM_SEQS_LIST=${NUM_SEQS_LIST:-"128 256"}
|
||||||
|
NUM_BATCHED_TOKENS_LIST=${NUM_BATCHED_TOKENS_LIST:-"512 1024 2048 4096"}
|
||||||
|
|
||||||
LOG_FOLDER="$BASE/auto-benchmark/$TAG"
|
LOG_FOLDER="$BASE/auto-benchmark/$TAG"
|
||||||
RESULT="$LOG_FOLDER/result.txt"
|
RESULT="$LOG_FOLDER/result.txt"
|
||||||
PROFILE_PATH="$LOG_FOLDER/profile"
|
PROFILE_PATH="$LOG_FOLDER/profile"
|
||||||
|
|
||||||
echo "result file: $RESULT"
|
echo "====================== AUTO TUNE PARAMETERS ===================="
|
||||||
echo "model: $MODEL"
|
echo "SCRIPT_DIR=$SCRIPT_DIR"
|
||||||
|
echo "BASE=$BASE"
|
||||||
|
echo "MODEL=$MODEL"
|
||||||
|
echo "SYSTEM=$SYSTEM"
|
||||||
|
echo "TP=$TP"
|
||||||
|
echo "DOWNLOAD_DIR=$DOWNLOAD_DIR"
|
||||||
|
echo "INPUT_LEN=$INPUT_LEN"
|
||||||
|
echo "OUTPUT_LEN=$OUTPUT_LEN"
|
||||||
|
echo "MAX_MODEL_LEN=$MAX_MODEL_LEN"
|
||||||
|
echo "MIN_CACHE_HIT_PCT=$MIN_CACHE_HIT_PCT"
|
||||||
|
echo "MAX_LATENCY_ALLOWED_MS=$MAX_LATENCY_ALLOWED_MS"
|
||||||
|
echo "NUM_SEQS_LIST=$NUM_SEQS_LIST"
|
||||||
|
echo "NUM_BATCHED_TOKENS_LIST=$NUM_BATCHED_TOKENS_LIST"
|
||||||
|
echo "VLLM_LOGGING_LEVEL=$VLLM_LOGGING_LEVEL"
|
||||||
|
echo "RESULT_FILE=$RESULT"
|
||||||
|
echo "====================== AUTO TUNEPARAMETERS ===================="
|
||||||
|
|
||||||
rm -rf $LOG_FOLDER
|
rm -rf $LOG_FOLDER
|
||||||
rm -rf $PROFILE_PATH
|
rm -rf $PROFILE_PATH
|
||||||
@ -87,10 +103,15 @@ start_server() {
|
|||||||
VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 \
|
VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 \
|
||||||
vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 &
|
vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 &
|
||||||
fi
|
fi
|
||||||
|
local server_pid=$!
|
||||||
|
|
||||||
# wait for 10 minutes...
|
# wait for 10 minutes...
|
||||||
server_started=0
|
server_started=0
|
||||||
for i in {1..60}; do
|
for i in {1..60}; do
|
||||||
|
# This line checks whether the server is still alive or not,
|
||||||
|
# since that we should always have permission to send signal to the server process.
|
||||||
|
kill -0 $server_pid 2> /dev/null || break
|
||||||
|
|
||||||
RESPONSE=$(curl -s -X GET "http://0.0.0.0:8004/health" -w "%{http_code}" -o /dev/stdout)
|
RESPONSE=$(curl -s -X GET "http://0.0.0.0:8004/health" -w "%{http_code}" -o /dev/stdout)
|
||||||
STATUS_CODE=$(echo "$RESPONSE" | tail -n 1)
|
STATUS_CODE=$(echo "$RESPONSE" | tail -n 1)
|
||||||
if [[ "$STATUS_CODE" -eq 200 ]]; then
|
if [[ "$STATUS_CODE" -eq 200 ]]; then
|
||||||
@ -102,7 +123,7 @@ start_server() {
|
|||||||
done
|
done
|
||||||
|
|
||||||
if (( ! server_started )); then
|
if (( ! server_started )); then
|
||||||
echo "server did not start within 10 minutes. Please check server log at $vllm_log".
|
echo "server did not start within 10 minutes or crashed. Please check server log at $vllm_log".
|
||||||
return 1
|
return 1
|
||||||
else
|
else
|
||||||
return 0
|
return 0
|
||||||
@ -213,7 +234,7 @@ run_benchmark() {
|
|||||||
|
|
||||||
pkill -if vllm
|
pkill -if vllm
|
||||||
sleep 10
|
sleep 10
|
||||||
printf '=%.0s' $(seq 1 20)
|
echo "===================="
|
||||||
return 0
|
return 0
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
128
benchmarks/auto_tune/batch_auto_tune.sh
Executable file
128
benchmarks/auto_tune/batch_auto_tune.sh
Executable file
@ -0,0 +1,128 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
INPUT_JSON="$1"
|
||||||
|
GCS_PATH="$2" # Optional GCS path for uploading results for each run
|
||||||
|
|
||||||
|
SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)
|
||||||
|
AUTOTUNE_SCRIPT="$SCRIPT_DIR/auto_tune.sh"
|
||||||
|
|
||||||
|
if [[ -z "$INPUT_JSON" ]]; then
|
||||||
|
echo "Error: Input JSON file not provided."
|
||||||
|
echo "Usage: $0 <path_to_json_file> [gcs_upload_path]"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [[ ! -f "$INPUT_JSON" ]]; then
|
||||||
|
echo "Error: File not found at '$INPUT_JSON'"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
if ! command -v jq &> /dev/null; then
|
||||||
|
echo "Error: 'jq' command not found. Please install jq to process the JSON input."
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [[ -n "$GCS_PATH" ]] && ! command -v gcloud &> /dev/null; then
|
||||||
|
echo "Error: 'gcloud' command not found, but a GCS_PATH was provided."
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
SUCCESS_COUNT=0
|
||||||
|
FAILURE_COUNT=0
|
||||||
|
FAILED_RUNS=()
|
||||||
|
SCRIPT_START_TIME=$(date +%s)
|
||||||
|
|
||||||
|
json_content=$(cat "$INPUT_JSON")
|
||||||
|
if ! num_runs=$(echo "$json_content" | jq 'length'); then
|
||||||
|
echo "Error: Invalid JSON in $INPUT_JSON. 'jq' failed to get array length." >&2
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo "Found $num_runs benchmark configurations in $INPUT_JSON."
|
||||||
|
echo "Starting benchmark runs..."
|
||||||
|
echo "--------------------------------------------------"
|
||||||
|
|
||||||
|
for i in $(seq 0 $(($num_runs - 1))); do
|
||||||
|
run_object=$(echo "$json_content" | jq ".[$i]")
|
||||||
|
|
||||||
|
RUN_START_TIME=$(date +%s)
|
||||||
|
ENV_VARS_ARRAY=()
|
||||||
|
# Dynamically create env vars from the JSON object's keys
|
||||||
|
for key in $(echo "$run_object" | jq -r 'keys_unsorted[]'); do
|
||||||
|
value=$(echo "$run_object" | jq -r ".$key")
|
||||||
|
var_name=$(echo "$key" | tr '[:lower:]' '[:upper:]' | tr -cd 'A-Z0-9_')
|
||||||
|
ENV_VARS_ARRAY+=("${var_name}=${value}")
|
||||||
|
done
|
||||||
|
|
||||||
|
echo "Executing run #$((i+1))/$num_runs with parameters: ${ENV_VARS_ARRAY[*]}"
|
||||||
|
|
||||||
|
# Execute auto_tune.sh and capture output
|
||||||
|
RUN_OUTPUT_FILE=$(mktemp)
|
||||||
|
if env "${ENV_VARS_ARRAY[@]}" bash "$AUTOTUNE_SCRIPT" > >(tee -a "$RUN_OUTPUT_FILE") 2>&1; then
|
||||||
|
STATUS="SUCCESS"
|
||||||
|
((SUCCESS_COUNT++))
|
||||||
|
else
|
||||||
|
STATUS="FAILURE"
|
||||||
|
((FAILURE_COUNT++))
|
||||||
|
FAILED_RUNS+=("Run #$((i+1)): $(echo $run_object | jq -c .)")
|
||||||
|
fi
|
||||||
|
|
||||||
|
RUN_OUTPUT=$(<"$RUN_OUTPUT_FILE")
|
||||||
|
rm "$RUN_OUTPUT_FILE"
|
||||||
|
|
||||||
|
# Parse results and optionally upload them to GCS
|
||||||
|
RUN_ID=""
|
||||||
|
RESULTS=""
|
||||||
|
GCS_RESULTS_URL=""
|
||||||
|
if [[ "$STATUS" == "SUCCESS" ]]; then
|
||||||
|
RESULT_FILE_PATH=$(echo "$RUN_OUTPUT" | grep 'RESULT_FILE=' | tail -n 1 | cut -d'=' -f2 | tr -s '/' || true)
|
||||||
|
|
||||||
|
if [[ -n "$RESULT_FILE_PATH" && -f "$RESULT_FILE_PATH" ]]; then
|
||||||
|
RUN_ID=$(basename "$(dirname "$RESULT_FILE_PATH")")
|
||||||
|
RESULT_DIR=$(dirname "$RESULT_FILE_PATH")
|
||||||
|
RESULTS=$(cat "$RESULT_FILE_PATH")
|
||||||
|
|
||||||
|
if [[ -n "$GCS_PATH" ]]; then
|
||||||
|
GCS_RESULTS_URL="${GCS_PATH}/${RUN_ID}"
|
||||||
|
echo "Uploading results to GCS..."
|
||||||
|
if gcloud storage rsync --recursive "$RESULT_DIR/" "$GCS_RESULTS_URL"; then
|
||||||
|
echo "GCS upload successful."
|
||||||
|
else
|
||||||
|
echo "Warning: GCS upload failed for RUN_ID $RUN_ID."
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
else
|
||||||
|
echo "Warning: Could not find result file for a successful run."
|
||||||
|
STATUS="WARNING_NO_RESULT_FILE"
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Add the results back into the JSON object for this run
|
||||||
|
json_content=$(echo "$json_content" | jq --argjson i "$i" --arg run_id "$RUN_ID" --arg status "$STATUS" --arg results "$RESULTS" --arg gcs_results "$GCS_RESULTS_URL" \
|
||||||
|
'.[$i] += {run_id: $run_id, status: $status, results: $results, gcs_results: $gcs_results}')
|
||||||
|
|
||||||
|
RUN_END_TIME=$(date +%s)
|
||||||
|
echo "Run finished in $((RUN_END_TIME - RUN_START_TIME)) seconds. Status: $STATUS"
|
||||||
|
echo "--------------------------------------------------"
|
||||||
|
|
||||||
|
# Save intermediate progress back to the file
|
||||||
|
echo "$json_content" > "$INPUT_JSON.tmp" && mv "$INPUT_JSON.tmp" "$INPUT_JSON"
|
||||||
|
|
||||||
|
done
|
||||||
|
|
||||||
|
SCRIPT_END_TIME=$(date +%s)
|
||||||
|
echo "All benchmark runs completed in $((SCRIPT_END_TIME - SCRIPT_START_TIME)) seconds."
|
||||||
|
echo
|
||||||
|
echo "====================== SUMMARY ======================"
|
||||||
|
echo "Successful runs: $SUCCESS_COUNT"
|
||||||
|
echo "Failed runs: $FAILURE_COUNT"
|
||||||
|
echo "==================================================="
|
||||||
|
|
||||||
|
if [[ $FAILURE_COUNT -gt 0 ]]; then
|
||||||
|
echo "Details of failed runs (see JSON file for full parameters):"
|
||||||
|
for failed in "${FAILED_RUNS[@]}"; do
|
||||||
|
echo " - $failed"
|
||||||
|
done
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo "Updated results have been saved to '$INPUT_JSON'."
|
||||||
@@ -34,6 +34,7 @@ class RequestFuncInput:
     multi_modal_content: Optional[dict | list[dict]] = None
     ignore_eos: bool = False
     language: Optional[str] = None
+    request_id: Optional[str] = None


 @dataclass
@@ -71,6 +72,9 @@ async def async_request_tgi(
             "inputs": request_func_input.prompt,
             "parameters": params,
         }
+        headers = None
+        if request_func_input.request_id:
+            headers = {"x-request-id": request_func_input.request_id}
         output = RequestFuncOutput()
         output.prompt_len = request_func_input.prompt_len
         if request_func_input.ignore_eos:
@@ -82,7 +86,9 @@ async def async_request_tgi(
         st = time.perf_counter()
         most_recent_timestamp = st
         try:
-            async with session.post(url=api_url, json=payload) as response:
+            async with session.post(
+                url=api_url, json=payload, headers=headers
+            ) as response:
                 if response.status == 200:
                     async for chunk_bytes in response.content:
                         chunk_bytes = chunk_bytes.strip()
@@ -145,6 +151,9 @@ async def async_request_trt_llm(
         }
         if request_func_input.ignore_eos:
             payload["min_length"] = request_func_input.output_len
+        headers = None
+        if request_func_input.request_id:
+            headers = {"x-request-id": request_func_input.request_id}
         output = RequestFuncOutput()
         output.prompt_len = request_func_input.prompt_len

@@ -152,7 +161,9 @@ async def async_request_trt_llm(
         st = time.perf_counter()
         most_recent_timestamp = st
         try:
-            async with session.post(url=api_url, json=payload) as response:
+            async with session.post(
+                url=api_url, json=payload, headers=headers
+            ) as response:
                 if response.status == 200:
                     async for chunk_bytes in response.content:
                         chunk_bytes = chunk_bytes.strip()
@@ -211,6 +222,8 @@ async def async_request_deepspeed_mii(
         "top_p": 1.0,
     }
     headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
+    if request_func_input.request_id:
+        headers["x-request-id"] = request_func_input.request_id

     output = RequestFuncOutput()
     output.prompt_len = request_func_input.prompt_len
@@ -283,6 +296,8 @@ async def async_request_openai_completions(
     if request_func_input.extra_body:
         payload.update(request_func_input.extra_body)
     headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
+    if request_func_input.request_id:
+        headers["x-request-id"] = request_func_input.request_id

     output = RequestFuncOutput()
     output.prompt_len = request_func_input.prompt_len
@@ -395,6 +410,8 @@ async def async_request_openai_chat_completions(
         "Content-Type": "application/json",
         "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
     }
+    if request_func_input.request_id:
+        headers["x-request-id"] = request_func_input.request_id

     output = RequestFuncOutput()
     output.prompt_len = request_func_input.prompt_len
@@ -491,6 +508,8 @@ async def async_request_openai_audio(
     headers = {
         "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
     }
+    if request_func_input.request_id:
+        headers["x-request-id"] = request_func_input.request_id

     # Send audio file
     def to_bytes(y, sr):
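A minimal, self-contained sketch of the per-request header pattern these hunks introduce: attach an "x-request-id" header only when a request id was supplied. The endpoint URL and payload fields below are illustrative assumptions, not values taken from the benchmark code.

# Sketch only: mirrors the conditional header construction added above.
import asyncio
from typing import Optional

import aiohttp


async def send_request(prompt: str, request_id: Optional[str] = None) -> int:
    payload = {"inputs": prompt, "parameters": {"max_new_tokens": 16}}  # placeholder payload
    headers = {"x-request-id": request_id} if request_id else None
    async with aiohttp.ClientSession() as session:
        async with session.post(
            "http://localhost:8000/generate_stream", json=payload, headers=headers
        ) as response:
            return response.status


if __name__ == "__main__":
    print(asyncio.run(send_request("Hello", request_id="req-123")))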
@@ -57,7 +57,7 @@ def invoke_main() -> None:
         "--num-iteration",
         type=int,
         default=1000,
-        help="Number of iterations to run to stablize final data readings",
+        help="Number of iterations to run to stabilize final data readings",
     )
     parser.add_argument(
         "--allocate-blocks",
File diff suppressed because it is too large
@@ -1,191 +1,17 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""Benchmark the latency of processing a single batch of requests."""
+import sys
|
|
||||||
import argparse
|
|
||||||
import dataclasses
|
|
||||||
import json
|
|
||||||
import os
|
|
||||||
import time
|
|
||||||
from typing import Any, Optional
|
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
from tqdm import tqdm
|
|
||||||
from typing_extensions import deprecated
|
|
||||||
|
|
||||||
import vllm.envs as envs
|
|
||||||
from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
|
|
||||||
from vllm import LLM, SamplingParams
|
|
||||||
from vllm.engine.arg_utils import EngineArgs
|
|
||||||
from vllm.inputs import PromptType
|
|
||||||
from vllm.sampling_params import BeamSearchParams
|
|
||||||
from vllm.utils import FlexibleArgumentParser
|
|
||||||
|
|
||||||
|
|
||||||
def save_to_pytorch_benchmark_format(
|
|
||||||
args: argparse.Namespace, results: dict[str, Any]
|
|
||||||
) -> None:
|
|
||||||
pt_records = convert_to_pytorch_benchmark_format(
|
|
||||||
args=args,
|
|
||||||
metrics={"latency": results["latencies"]},
|
|
||||||
extra_info={k: results[k] for k in ["avg_latency", "percentiles"]},
|
|
||||||
)
|
|
||||||
if pt_records:
|
|
||||||
pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch.json"
|
|
||||||
write_to_json(pt_file, pt_records)
|
|
||||||
|
|
||||||
|
|
||||||
@deprecated(
|
|
||||||
"benchmark_latency.py is deprecated and will be removed in a "
|
|
||||||
"future version. Please use 'vllm bench latency' instead.",
|
|
||||||
)
|
|
||||||
def main(args: argparse.Namespace):
|
|
||||||
print(args)
|
|
||||||
|
|
||||||
engine_args = EngineArgs.from_cli_args(args)
|
|
||||||
|
|
||||||
# NOTE(woosuk): If the request cannot be processed in a single batch,
|
|
||||||
# the engine will automatically process the request in multiple batches.
|
|
||||||
llm = LLM(**dataclasses.asdict(engine_args))
|
|
||||||
assert llm.llm_engine.model_config.max_model_len >= (
|
|
||||||
args.input_len + args.output_len
|
|
||||||
), (
|
|
||||||
"Please ensure that max_model_len is greater than"
|
|
||||||
" the sum of input_len and output_len."
|
|
||||||
)
|
|
||||||
|
|
||||||
sampling_params = SamplingParams(
|
|
||||||
n=args.n,
|
|
||||||
temperature=1.0,
|
|
||||||
top_p=1.0,
|
|
||||||
ignore_eos=True,
|
|
||||||
max_tokens=args.output_len,
|
|
||||||
detokenize=not args.disable_detokenize,
|
|
||||||
)
|
|
||||||
print(sampling_params)
|
|
||||||
dummy_prompt_token_ids = np.random.randint(
|
|
||||||
10000, size=(args.batch_size, args.input_len)
|
|
||||||
)
|
|
||||||
dummy_prompts: list[PromptType] = [
|
|
||||||
{"prompt_token_ids": batch} for batch in dummy_prompt_token_ids.tolist()
|
|
||||||
]
|
|
||||||
|
|
||||||
def llm_generate():
|
|
||||||
if not args.use_beam_search:
|
|
||||||
llm.generate(dummy_prompts, sampling_params=sampling_params, use_tqdm=False)
|
|
||||||
else:
|
|
||||||
llm.beam_search(
|
|
||||||
dummy_prompts,
|
|
||||||
BeamSearchParams(
|
|
||||||
beam_width=args.n,
|
|
||||||
max_tokens=args.output_len,
|
|
||||||
ignore_eos=True,
|
|
||||||
),
|
|
||||||
)
|
|
||||||
|
|
||||||
def run_to_completion(profile_dir: Optional[str] = None):
|
|
||||||
if profile_dir:
|
|
||||||
llm.start_profile()
|
|
||||||
llm_generate()
|
|
||||||
llm.stop_profile()
|
|
||||||
else:
|
|
||||||
start_time = time.perf_counter()
|
|
||||||
llm_generate()
|
|
||||||
end_time = time.perf_counter()
|
|
||||||
latency = end_time - start_time
|
|
||||||
return latency
|
|
||||||
|
|
||||||
print("Warming up...")
|
|
||||||
for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"):
|
|
||||||
run_to_completion(profile_dir=None)
|
|
||||||
|
|
||||||
if args.profile:
|
|
||||||
profile_dir = envs.VLLM_TORCH_PROFILER_DIR
|
|
||||||
print(f"Profiling (results will be saved to '{profile_dir}')...")
|
|
||||||
run_to_completion(profile_dir=profile_dir)
|
|
||||||
return
|
|
||||||
|
|
||||||
# Benchmark.
|
|
||||||
latencies = []
|
|
||||||
for _ in tqdm(range(args.num_iters), desc="Profiling iterations"):
|
|
||||||
latencies.append(run_to_completion(profile_dir=None))
|
|
||||||
latencies = np.array(latencies)
|
|
||||||
percentages = [10, 25, 50, 75, 90, 99]
|
|
||||||
percentiles = np.percentile(latencies, percentages)
|
|
||||||
print(f"Avg latency: {np.mean(latencies)} seconds")
|
|
||||||
for percentage, percentile in zip(percentages, percentiles):
|
|
||||||
print(f"{percentage}% percentile latency: {percentile} seconds")
|
|
||||||
|
|
||||||
# Output JSON results if specified
|
|
||||||
if args.output_json:
|
|
||||||
results = {
|
|
||||||
"avg_latency": np.mean(latencies),
|
|
||||||
"latencies": latencies.tolist(),
|
|
||||||
"percentiles": dict(zip(percentages, percentiles.tolist())),
|
|
||||||
}
|
|
||||||
with open(args.output_json, "w") as f:
|
|
||||||
json.dump(results, f, indent=4)
|
|
||||||
save_to_pytorch_benchmark_format(args, results)
|
|
||||||
|
|
||||||
|
|
||||||
def create_argument_parser():
|
|
||||||
parser = FlexibleArgumentParser(
|
|
||||||
description="Benchmark the latency of processing a single batch of "
|
|
||||||
"requests till completion."
|
|
||||||
)
|
|
||||||
parser.add_argument("--input-len", type=int, default=32)
|
|
||||||
parser.add_argument("--output-len", type=int, default=128)
|
|
||||||
parser.add_argument("--batch-size", type=int, default=8)
|
|
||||||
parser.add_argument(
|
|
||||||
"--n",
|
|
||||||
type=int,
|
|
||||||
default=1,
|
|
||||||
help="Number of generated sequences per prompt.",
|
|
||||||
)
|
|
||||||
parser.add_argument("--use-beam-search", action="store_true")
|
|
||||||
parser.add_argument(
|
|
||||||
"--num-iters-warmup",
|
|
||||||
type=int,
|
|
||||||
default=10,
|
|
||||||
help="Number of iterations to run for warmup.",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--num-iters", type=int, default=30, help="Number of iterations to run."
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--profile",
|
|
||||||
action="store_true",
|
|
||||||
help="profile the generation process of a single batch",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--output-json",
|
|
||||||
type=str,
|
|
||||||
default=None,
|
|
||||||
help="Path to save the latency results in JSON format.",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--disable-detokenize",
|
|
||||||
action="store_true",
|
|
||||||
help=(
|
|
||||||
"Do not detokenize responses (i.e. do not include "
|
|
||||||
"detokenization time in the latency measurement)"
|
|
||||||
),
|
|
||||||
)
|
|
||||||
|
|
||||||
parser = EngineArgs.add_cli_args(parser)
|
|
||||||
# V1 enables prefix caching by default which skews the latency
|
|
||||||
# numbers. We need to disable prefix caching by default.
|
|
||||||
parser.set_defaults(enable_prefix_caching=False)
|
|
||||||
|
|
||||||
return parser
|
|
||||||
|
|
||||||
|
|
||||||
 if __name__ == "__main__":
-    parser = create_argument_parser()
-    args = parser.parse_args()
-    if args.profile and not envs.VLLM_TORCH_PROFILER_DIR:
-        raise OSError(
-            "The environment variable 'VLLM_TORCH_PROFILER_DIR' is not set. "
-            "Please set it to a valid path to use torch profiler."
-        )
-    main(args)
+    print("""DEPRECATED: This script has been moved to the vLLM CLI.
+
+Please use the following command instead:
+    vllm bench latency
+
+For help with the new command, run:
+    vllm bench latency --help
+
+Alternatively, you can run the new command directly with:
+    python -m vllm.entrypoints.cli.main bench latency --help
+""")
+    sys.exit(1)
@@ -1,17 +1,31 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import gc
+import time
+from unittest import mock

 import numpy as np
 from tabulate import tabulate

 from benchmark_utils import TimeCollector
-from vllm.config import ModelConfig, SpeculativeConfig, VllmConfig
+from vllm.config import (
+    CacheConfig,
+    DeviceConfig,
+    LoadConfig,
+    ModelConfig,
+    ParallelConfig,
+    SchedulerConfig,
+    SpeculativeConfig,
+    VllmConfig,
+)
+from vllm.platforms import current_platform
 from vllm.utils import FlexibleArgumentParser
 from vllm.v1.spec_decode.ngram_proposer import NgramProposer
+from vllm.v1.worker.gpu_input_batch import InputBatch
+from vllm.v1.worker.gpu_model_runner import GPUModelRunner


-def main(args):
+def benchmark_propose(args):
     rows = []
     for max_ngram in args.max_ngram:
         collector = TimeCollector(TimeCollector.US)
@@ -69,15 +83,93 @@ def main(args):
     )


+def benchmark_batched_propose(args):
+    NUM_SPECULATIVE_TOKENS_NGRAM = 10
+    PROMPT_LOOKUP_MIN = 5
+    PROMPT_LOOKUP_MAX = 15
+    MAX_MODEL_LEN = int(1e7)
+    DEVICE = current_platform.device_type
+
+    model_config = ModelConfig(model="facebook/opt-125m", runner="generate")
+
+    speculative_config = SpeculativeConfig(
+        target_model_config=model_config,
+        target_parallel_config=ParallelConfig(),
+        method="ngram",
+        num_speculative_tokens=NUM_SPECULATIVE_TOKENS_NGRAM,
+        prompt_lookup_max=PROMPT_LOOKUP_MAX,
+        prompt_lookup_min=PROMPT_LOOKUP_MIN,
+    )
+
+    vllm_config = VllmConfig(
+        model_config=model_config,
+        cache_config=CacheConfig(),
+        speculative_config=speculative_config,
+        device_config=DeviceConfig(device=current_platform.device_type),
+        parallel_config=ParallelConfig(),
+        load_config=LoadConfig(),
+        scheduler_config=SchedulerConfig(),
+    )
+
+    # monkey patch vllm.v1.worker.gpu_model_runner.get_pp_group
+    mock_pp_group = mock.MagicMock()
+    mock_pp_group.world_size = 1
+    with mock.patch(
+        "vllm.v1.worker.gpu_model_runner.get_pp_group", return_value=mock_pp_group
+    ):
+        runner = GPUModelRunner(vllm_config, DEVICE)
+
+    # hack max model len
+    runner.max_model_len = MAX_MODEL_LEN
+    runner.drafter.max_model_len = MAX_MODEL_LEN
+
+    dummy_input_batch = InputBatch(
+        max_num_reqs=args.num_req,
+        max_model_len=MAX_MODEL_LEN,
+        max_num_batched_tokens=args.num_req * args.num_token,
+        device=DEVICE,
+        pin_memory=False,
+        vocab_size=256000,
+        block_sizes=[16],
+    )
+    dummy_input_batch._req_ids = list(str(id) for id in range(args.num_req))
+    dummy_input_batch.spec_decode_unsupported_reqs = ()
+    dummy_input_batch.num_tokens_no_spec = [args.num_token] * args.num_req
+    dummy_input_batch.token_ids_cpu = np.random.randint(
+        0, 20, (args.num_req, args.num_token)
+    )
+
+    runner.input_batch = dummy_input_batch
+
+    sampled_token_ids = [[0]] * args.num_req
+
+    print("Starting benchmark")
+    # first run is warmup so ignore it
+    for _ in range(args.num_iteration):
+        start = time.time()
+        runner.drafter.propose(
+            sampled_token_ids,
+            dummy_input_batch.req_ids,
+            dummy_input_batch.num_tokens_no_spec,
+            dummy_input_batch.token_ids_cpu,
+            dummy_input_batch.spec_decode_unsupported_reqs,
+        )
+        end = time.time()
+        print(f"Iteration time (s): {end - start}")
+
+
 def invoke_main() -> None:
     parser = FlexibleArgumentParser(
         description="Benchmark the performance of N-gram speculative decode drafting"
     )
+    parser.add_argument(
+        "--batched", action="store_true", help="consider time to prepare batch"
+    )  # noqa: E501
     parser.add_argument(
         "--num-iteration",
         type=int,
         default=100,
-        help="Number of iterations to run to stablize final data readings",
+        help="Number of iterations to run to stabilize final data readings",
     )
     parser.add_argument(
         "--num-req", type=int, default=128, help="Number of requests in the batch"
@@ -105,8 +197,17 @@ def invoke_main() -> None:
         help="Number of speculative tokens to generate",
     )
     args = parser.parse_args()
-    main(args)
+
+    if not args.batched:
+        benchmark_propose(args)
+    else:
+        benchmark_batched_propose(args)
+
+
+"""
+# Example command lines:
+# time python3 benchmarks/benchmark_ngram_proposer.py
+# time python3 benchmarks/benchmark_ngram_proposer.py --batched --num-iteration 4 --num-token 1000000 --num-req 128
+"""  # noqa: E501
 if __name__ == "__main__":
     invoke_main()  # pragma: no cover
File diff suppressed because it is too large
@@ -449,7 +449,8 @@ async def benchmark(
     def prepare_extra_body(request) -> dict:
         extra_body = {}
         # Add the schema to the extra_body
-        extra_body[request.structure_type] = request.schema
+        extra_body["structured_outputs"] = {}
+        extra_body["structured_outputs"][request.structure_type] = request.schema
         return extra_body

     print("Starting initial single prompt test run...")
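For reference, a small illustrative sketch of the request payload shape produced by the updated prepare_extra_body. The structure type and schema below are made-up examples; "json" is one of the structure types used later in this file.

# Illustrative only: shape of extra_body after the change above.
request_structure_type = "json"
request_schema = {"type": "object", "properties": {"name": {"type": "string"}}}

extra_body = {"structured_outputs": {request_structure_type: request_schema}}
print(extra_body)
# -> {'structured_outputs': {'json': {'type': 'object', 'properties': {...}}}}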
@@ -696,11 +697,11 @@ def evaluate(ret, args):
         return re.match(args.regex, actual) is not None

     def _eval_correctness(expected, actual):
-        if args.structure_type == "guided_json":
+        if args.structure_type == "json":
             return _eval_correctness_json(expected, actual)
-        elif args.structure_type == "guided_regex":
+        elif args.structure_type == "regex":
             return _eval_correctness_regex(expected, actual)
-        elif args.structure_type == "guided_choice":
+        elif args.structure_type == "choice":
             return _eval_correctness_choice(expected, actual)
         else:
             return None
@@ -780,18 +781,18 @@ def main(args: argparse.Namespace):
     )

     if args.dataset == "grammar":
-        args.structure_type = "guided_grammar"
+        args.structure_type = "grammar"
     elif args.dataset == "regex":
-        args.structure_type = "guided_regex"
+        args.structure_type = "regex"
     elif args.dataset == "choice":
-        args.structure_type = "guided_choice"
+        args.structure_type = "choice"
     else:
-        args.structure_type = "guided_json"
+        args.structure_type = "json"

     if args.no_structured_output:
         args.structured_output_ratio = 0
     if args.save_results:
-        result_file_name = f"{args.structured_output_ratio}guided"
+        result_file_name = f"{args.structured_output_ratio}so"
         result_file_name += f"_{backend}"
         result_file_name += f"_{args.request_rate}qps"
         result_file_name += f"_{args.model.split('/')[-1]}"
@@ -998,7 +999,7 @@ def create_argument_parser():
         "--percentile-metrics",
         type=str,
         default="ttft,tpot,itl",
-        help="Comma-separated list of selected metrics to report percentils. "
+        help="Comma-separated list of selected metrics to report percentiles. "
         "This argument specifies the metrics to report percentiles. "
         'Allowed metric names are "ttft", "tpot", "itl", "e2el". '
         'Default value is "ttft,tpot,itl".',
@@ -1,742 +1,17 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""Benchmark offline inference throughput."""
+import sys
|
|
||||||
import argparse
|
|
||||||
import dataclasses
|
|
||||||
import json
|
|
||||||
import os
|
|
||||||
import random
|
|
||||||
import time
|
|
||||||
import warnings
|
|
||||||
from typing import Any, Optional, Union
|
|
||||||
|
|
||||||
import torch
|
|
||||||
import uvloop
|
|
||||||
from tqdm import tqdm
|
|
||||||
from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerBase
|
|
||||||
from typing_extensions import deprecated
|
|
||||||
|
|
||||||
from benchmark_dataset import (
|
|
||||||
AIMODataset,
|
|
||||||
BurstGPTDataset,
|
|
||||||
ConversationDataset,
|
|
||||||
InstructCoderDataset,
|
|
||||||
RandomDataset,
|
|
||||||
SampleRequest,
|
|
||||||
ShareGPTDataset,
|
|
||||||
SonnetDataset,
|
|
||||||
VisionArenaDataset,
|
|
||||||
)
|
|
||||||
from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
|
|
||||||
from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
|
|
||||||
from vllm.entrypoints.openai.api_server import (
|
|
||||||
build_async_engine_client_from_engine_args,
|
|
||||||
)
|
|
||||||
from vllm.inputs import TextPrompt, TokensPrompt
|
|
||||||
from vllm.lora.request import LoRARequest
|
|
||||||
from vllm.outputs import RequestOutput
|
|
||||||
from vllm.sampling_params import BeamSearchParams
|
|
||||||
from vllm.utils import FlexibleArgumentParser, merge_async_iterators
|
|
||||||
|
|
||||||
|
|
||||||
def run_vllm(
|
|
||||||
requests: list[SampleRequest],
|
|
||||||
n: int,
|
|
||||||
engine_args: EngineArgs,
|
|
||||||
disable_detokenize: bool = False,
|
|
||||||
) -> tuple[float, Optional[list[RequestOutput]]]:
|
|
||||||
from vllm import LLM, SamplingParams
|
|
||||||
|
|
||||||
llm = LLM(**dataclasses.asdict(engine_args))
|
|
||||||
assert all(
|
|
||||||
llm.llm_engine.model_config.max_model_len
|
|
||||||
>= (request.prompt_len + request.expected_output_len)
|
|
||||||
for request in requests
|
|
||||||
), (
|
|
||||||
"Please ensure that max_model_len is greater than the sum of"
|
|
||||||
" prompt_len and expected_output_len for all requests."
|
|
||||||
)
|
|
||||||
# Add the requests to the engine.
|
|
||||||
prompts: list[Union[TextPrompt, TokensPrompt]] = []
|
|
||||||
sampling_params: list[SamplingParams] = []
|
|
||||||
for request in requests:
|
|
||||||
prompts.append(
|
|
||||||
TokensPrompt(
|
|
||||||
prompt_token_ids=request.prompt["prompt_token_ids"],
|
|
||||||
multi_modal_data=request.multi_modal_data,
|
|
||||||
)
|
|
||||||
if "prompt_token_ids" in request.prompt
|
|
||||||
else TextPrompt(
|
|
||||||
prompt=request.prompt, multi_modal_data=request.multi_modal_data
|
|
||||||
)
|
|
||||||
)
|
|
||||||
sampling_params.append(
|
|
||||||
SamplingParams(
|
|
||||||
n=n,
|
|
||||||
temperature=1.0,
|
|
||||||
top_p=1.0,
|
|
||||||
ignore_eos=True,
|
|
||||||
max_tokens=request.expected_output_len,
|
|
||||||
detokenize=not disable_detokenize,
|
|
||||||
)
|
|
||||||
)
|
|
||||||
lora_requests: Optional[list[LoRARequest]] = None
|
|
||||||
if engine_args.enable_lora:
|
|
||||||
lora_requests = [request.lora_request for request in requests]
|
|
||||||
|
|
||||||
use_beam_search = False
|
|
||||||
|
|
||||||
outputs = None
|
|
||||||
if not use_beam_search:
|
|
||||||
start = time.perf_counter()
|
|
||||||
outputs = llm.generate(
|
|
||||||
prompts, sampling_params, lora_request=lora_requests, use_tqdm=True
|
|
||||||
)
|
|
||||||
end = time.perf_counter()
|
|
||||||
else:
|
|
||||||
assert lora_requests is None, "BeamSearch API does not support LoRA"
|
|
||||||
prompts = [request.prompt for request in requests]
|
|
||||||
# output_len should be the same for all requests.
|
|
||||||
output_len = requests[0].expected_output_len
|
|
||||||
for request in requests:
|
|
||||||
assert request.expected_output_len == output_len
|
|
||||||
start = time.perf_counter()
|
|
||||||
llm.beam_search(
|
|
||||||
prompts,
|
|
||||||
BeamSearchParams(
|
|
||||||
beam_width=n,
|
|
||||||
max_tokens=output_len,
|
|
||||||
ignore_eos=True,
|
|
||||||
),
|
|
||||||
)
|
|
||||||
end = time.perf_counter()
|
|
||||||
return end - start, outputs
|
|
||||||
|
|
||||||
|
|
||||||
def run_vllm_chat(
|
|
||||||
requests: list[SampleRequest],
|
|
||||||
n: int,
|
|
||||||
engine_args: EngineArgs,
|
|
||||||
disable_detokenize: bool = False,
|
|
||||||
) -> tuple[float, list[RequestOutput]]:
|
|
||||||
"""
|
|
||||||
Run vLLM chat benchmark. This function is recommended ONLY for benchmarking
|
|
||||||
multimodal models as it properly handles multimodal inputs and chat
|
|
||||||
formatting. For non-multimodal models, use run_vllm() instead.
|
|
||||||
"""
|
|
||||||
from vllm import LLM, SamplingParams
|
|
||||||
|
|
||||||
llm = LLM(**dataclasses.asdict(engine_args))
|
|
||||||
|
|
||||||
assert all(
|
|
||||||
llm.llm_engine.model_config.max_model_len
|
|
||||||
>= (request.prompt_len + request.expected_output_len)
|
|
||||||
for request in requests
|
|
||||||
), (
|
|
||||||
"Please ensure that max_model_len is greater than the sum of "
|
|
||||||
"prompt_len and expected_output_len for all requests."
|
|
||||||
)
|
|
||||||
|
|
||||||
prompts = []
|
|
||||||
sampling_params: list[SamplingParams] = []
|
|
||||||
for request in requests:
|
|
||||||
prompts.append(request.prompt)
|
|
||||||
sampling_params.append(
|
|
||||||
SamplingParams(
|
|
||||||
n=n,
|
|
||||||
temperature=1.0,
|
|
||||||
top_p=1.0,
|
|
||||||
ignore_eos=True,
|
|
||||||
max_tokens=request.expected_output_len,
|
|
||||||
detokenize=not disable_detokenize,
|
|
||||||
)
|
|
||||||
)
|
|
||||||
start = time.perf_counter()
|
|
||||||
outputs = llm.chat(prompts, sampling_params, use_tqdm=True)
|
|
||||||
end = time.perf_counter()
|
|
||||||
return end - start, outputs
|
|
||||||
|
|
||||||
|
|
||||||
async def run_vllm_async(
|
|
||||||
requests: list[SampleRequest],
|
|
||||||
n: int,
|
|
||||||
engine_args: AsyncEngineArgs,
|
|
||||||
disable_frontend_multiprocessing: bool = False,
|
|
||||||
disable_detokenize: bool = False,
|
|
||||||
) -> float:
|
|
||||||
from vllm import SamplingParams
|
|
||||||
|
|
||||||
async with build_async_engine_client_from_engine_args(
|
|
||||||
engine_args,
|
|
||||||
disable_frontend_multiprocessing=disable_frontend_multiprocessing,
|
|
||||||
) as llm:
|
|
||||||
model_config = await llm.get_model_config()
|
|
||||||
assert all(
|
|
||||||
model_config.max_model_len
|
|
||||||
>= (request.prompt_len + request.expected_output_len)
|
|
||||||
for request in requests
|
|
||||||
), (
|
|
||||||
"Please ensure that max_model_len is greater than the sum of"
|
|
||||||
" prompt_len and expected_output_len for all requests."
|
|
||||||
)
|
|
||||||
|
|
||||||
# Add the requests to the engine.
|
|
||||||
prompts: list[Union[TextPrompt, TokensPrompt]] = []
|
|
||||||
sampling_params: list[SamplingParams] = []
|
|
||||||
lora_requests: list[Optional[LoRARequest]] = []
|
|
||||||
for request in requests:
|
|
||||||
prompts.append(
|
|
||||||
TokensPrompt(
|
|
||||||
prompt_token_ids=request.prompt["prompt_token_ids"],
|
|
||||||
multi_modal_data=request.multi_modal_data,
|
|
||||||
)
|
|
||||||
if "prompt_token_ids" in request.prompt
|
|
||||||
else TextPrompt(
|
|
||||||
prompt=request.prompt, multi_modal_data=request.multi_modal_data
|
|
||||||
)
|
|
||||||
)
|
|
||||||
sampling_params.append(
|
|
||||||
SamplingParams(
|
|
||||||
n=n,
|
|
||||||
temperature=1.0,
|
|
||||||
top_p=1.0,
|
|
||||||
ignore_eos=True,
|
|
||||||
max_tokens=request.expected_output_len,
|
|
||||||
detokenize=not disable_detokenize,
|
|
||||||
)
|
|
||||||
)
|
|
||||||
lora_requests.append(request.lora_request)
|
|
||||||
|
|
||||||
generators = []
|
|
||||||
start = time.perf_counter()
|
|
||||||
for i, (prompt, sp, lr) in enumerate(
|
|
||||||
zip(prompts, sampling_params, lora_requests)
|
|
||||||
):
|
|
||||||
generator = llm.generate(prompt, sp, lora_request=lr, request_id=f"test{i}")
|
|
||||||
generators.append(generator)
|
|
||||||
all_gens = merge_async_iterators(*generators)
|
|
||||||
async for i, res in all_gens:
|
|
||||||
pass
|
|
||||||
end = time.perf_counter()
|
|
||||||
return end - start
|
|
||||||
|
|
||||||
|
|
||||||
def run_hf(
|
|
||||||
requests: list[SampleRequest],
|
|
||||||
model: str,
|
|
||||||
tokenizer: PreTrainedTokenizerBase,
|
|
||||||
n: int,
|
|
||||||
max_batch_size: int,
|
|
||||||
trust_remote_code: bool,
|
|
||||||
disable_detokenize: bool = False,
|
|
||||||
) -> float:
|
|
||||||
llm = AutoModelForCausalLM.from_pretrained(
|
|
||||||
model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code
|
|
||||||
)
|
|
||||||
if llm.config.model_type == "llama":
|
|
||||||
# To enable padding in the HF backend.
|
|
||||||
tokenizer.pad_token = tokenizer.eos_token
|
|
||||||
llm = llm.cuda()
|
|
||||||
|
|
||||||
pbar = tqdm(total=len(requests))
|
|
||||||
start = time.perf_counter()
|
|
||||||
batch: list[str] = []
|
|
||||||
max_prompt_len = 0
|
|
||||||
max_output_len = 0
|
|
||||||
for i in range(len(requests)):
|
|
||||||
prompt = requests[i].prompt
|
|
||||||
prompt_len = requests[i].prompt_len
|
|
||||||
output_len = requests[i].expected_output_len
|
|
||||||
# Add the prompt to the batch.
|
|
||||||
batch.append(prompt)
|
|
||||||
max_prompt_len = max(max_prompt_len, prompt_len)
|
|
||||||
max_output_len = max(max_output_len, output_len)
|
|
||||||
if len(batch) < max_batch_size and i != len(requests) - 1:
|
|
||||||
# Check if we can add more requests to the batch.
|
|
||||||
next_prompt_len = requests[i + 1].prompt_len
|
|
||||||
next_output_len = requests[i + 1].expected_output_len
|
|
||||||
if (
|
|
||||||
max(max_prompt_len, next_prompt_len)
|
|
||||||
+ max(max_output_len, next_output_len)
|
|
||||||
) <= 2048:
|
|
||||||
# We can add more requests to the batch.
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Generate the sequences.
|
|
||||||
input_ids = tokenizer(batch, return_tensors="pt", padding=True).input_ids
|
|
||||||
llm_outputs = llm.generate(
|
|
||||||
input_ids=input_ids.cuda(),
|
|
||||||
do_sample=True,
|
|
||||||
num_return_sequences=n,
|
|
||||||
temperature=1.0,
|
|
||||||
top_p=1.0,
|
|
||||||
use_cache=True,
|
|
||||||
max_new_tokens=max_output_len,
|
|
||||||
)
|
|
||||||
if not disable_detokenize:
|
|
||||||
# Include the decoding time.
|
|
||||||
tokenizer.batch_decode(llm_outputs, skip_special_tokens=True)
|
|
||||||
pbar.update(len(batch))
|
|
||||||
|
|
||||||
# Clear the batch.
|
|
||||||
batch = []
|
|
||||||
max_prompt_len = 0
|
|
||||||
max_output_len = 0
|
|
||||||
end = time.perf_counter()
|
|
||||||
return end - start
|
|
||||||
|
|
||||||
|
|
||||||
def run_mii(
|
|
||||||
requests: list[SampleRequest],
|
|
||||||
model: str,
|
|
||||||
tensor_parallel_size: int,
|
|
||||||
output_len: int,
|
|
||||||
) -> float:
|
|
||||||
from mii import client, serve
|
|
||||||
|
|
||||||
llm = serve(model, tensor_parallel=tensor_parallel_size)
|
|
||||||
prompts = [request.prompt for request in requests]
|
|
||||||
|
|
||||||
start = time.perf_counter()
|
|
||||||
llm.generate(prompts, max_new_tokens=output_len)
|
|
||||||
end = time.perf_counter()
|
|
||||||
client = client(model)
|
|
||||||
client.terminate_server()
|
|
||||||
return end - start
|
|
||||||
|
|
||||||
|
|
||||||
def save_to_pytorch_benchmark_format(
|
|
||||||
args: argparse.Namespace, results: dict[str, Any]
|
|
||||||
) -> None:
|
|
||||||
pt_records = convert_to_pytorch_benchmark_format(
|
|
||||||
args=args,
|
|
||||||
metrics={
|
|
||||||
"requests_per_second": [results["requests_per_second"]],
|
|
||||||
"tokens_per_second": [results["tokens_per_second"]],
|
|
||||||
},
|
|
||||||
extra_info={
|
|
||||||
k: results[k] for k in ["elapsed_time", "num_requests", "total_num_tokens"]
|
|
||||||
},
|
|
||||||
)
|
|
||||||
if pt_records:
|
|
||||||
# Don't use json suffix here as we don't want CI to pick it up
|
|
||||||
pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch.json"
|
|
||||||
write_to_json(pt_file, pt_records)
|
|
||||||
|
|
||||||
|
|
||||||
def get_requests(args, tokenizer):
|
|
||||||
# Common parameters for all dataset types.
|
|
||||||
common_kwargs = {
|
|
||||||
"dataset_path": args.dataset_path,
|
|
||||||
"random_seed": args.seed,
|
|
||||||
}
|
|
||||||
sample_kwargs = {
|
|
||||||
"tokenizer": tokenizer,
|
|
||||||
"lora_path": args.lora_path,
|
|
||||||
"max_loras": args.max_loras,
|
|
||||||
"num_requests": args.num_prompts,
|
|
||||||
"input_len": args.input_len,
|
|
||||||
"output_len": args.output_len,
|
|
||||||
}
|
|
||||||
|
|
||||||
if args.dataset_path is None or args.dataset_name == "random":
|
|
||||||
sample_kwargs["range_ratio"] = args.random_range_ratio
|
|
||||||
sample_kwargs["prefix_len"] = args.prefix_len
|
|
||||||
dataset_cls = RandomDataset
|
|
||||||
elif args.dataset_name == "sharegpt":
|
|
||||||
dataset_cls = ShareGPTDataset
|
|
||||||
if args.backend == "vllm-chat":
|
|
||||||
sample_kwargs["enable_multimodal_chat"] = True
|
|
||||||
elif args.dataset_name == "sonnet":
|
|
||||||
assert tokenizer.chat_template or tokenizer.default_chat_template, (
|
|
||||||
"Tokenizer/model must have chat template for sonnet dataset."
|
|
||||||
)
|
|
||||||
dataset_cls = SonnetDataset
|
|
||||||
sample_kwargs["prefix_len"] = args.prefix_len
|
|
||||||
sample_kwargs["return_prompt_formatted"] = True
|
|
||||||
elif args.dataset_name == "burstgpt":
|
|
||||||
dataset_cls = BurstGPTDataset
|
|
||||||
elif args.dataset_name == "hf":
|
|
||||||
common_kwargs["no_stream"] = args.no_stream
|
|
||||||
if args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS:
|
|
||||||
dataset_cls = VisionArenaDataset
|
|
||||||
common_kwargs["dataset_subset"] = None
|
|
||||||
common_kwargs["dataset_split"] = "train"
|
|
||||||
sample_kwargs["enable_multimodal_chat"] = True
|
|
||||||
elif args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS:
|
|
||||||
dataset_cls = InstructCoderDataset
|
|
||||||
common_kwargs["dataset_split"] = "train"
|
|
||||||
elif args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS:
|
|
||||||
dataset_cls = ConversationDataset
|
|
||||||
common_kwargs["dataset_subset"] = args.hf_subset
|
|
||||||
common_kwargs["dataset_split"] = args.hf_split
|
|
||||||
sample_kwargs["enable_multimodal_chat"] = True
|
|
||||||
elif args.dataset_path in AIMODataset.SUPPORTED_DATASET_PATHS:
|
|
||||||
dataset_cls = AIMODataset
|
|
||||||
common_kwargs["dataset_subset"] = None
|
|
||||||
common_kwargs["dataset_split"] = "train"
|
|
||||||
else:
|
|
||||||
raise ValueError(f"Unknown dataset name: {args.dataset_name}")
|
|
||||||
# Remove None values
|
|
||||||
sample_kwargs = {k: v for k, v in sample_kwargs.items() if v is not None}
|
|
||||||
return dataset_cls(**common_kwargs).sample(**sample_kwargs)
|
|
||||||
|
|
||||||
|
|
||||||
@deprecated(
|
|
||||||
"benchmark_throughput.py is deprecated and will be removed in a "
|
|
||||||
"future version. Please use 'vllm bench throughput' instead.",
|
|
||||||
)
|
|
||||||
def main(args: argparse.Namespace):
|
|
||||||
if args.seed is None:
|
|
||||||
args.seed = 0
|
|
||||||
print(args)
|
|
||||||
random.seed(args.seed)
|
|
||||||
# Sample the requests.
|
|
||||||
tokenizer = AutoTokenizer.from_pretrained(
|
|
||||||
args.tokenizer, trust_remote_code=args.trust_remote_code
|
|
||||||
)
|
|
||||||
requests = get_requests(args, tokenizer)
|
|
||||||
is_multi_modal = any(request.multi_modal_data is not None for request in requests)
|
|
||||||
request_outputs: Optional[list[RequestOutput]] = None
|
|
||||||
if args.backend == "vllm":
|
|
||||||
if args.async_engine:
|
|
||||||
elapsed_time = uvloop.run(
|
|
||||||
run_vllm_async(
|
|
||||||
requests,
|
|
||||||
args.n,
|
|
||||||
AsyncEngineArgs.from_cli_args(args),
|
|
||||||
args.disable_frontend_multiprocessing,
|
|
||||||
args.disable_detokenize,
|
|
||||||
)
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
elapsed_time, request_outputs = run_vllm(
|
|
||||||
requests,
|
|
||||||
args.n,
|
|
||||||
EngineArgs.from_cli_args(args),
|
|
||||||
args.disable_detokenize,
|
|
||||||
)
|
|
||||||
elif args.backend == "hf":
|
|
||||||
assert args.tensor_parallel_size == 1
|
|
||||||
elapsed_time = run_hf(
|
|
||||||
requests,
|
|
||||||
args.model,
|
|
||||||
tokenizer,
|
|
||||||
args.n,
|
|
||||||
args.hf_max_batch_size,
|
|
||||||
args.trust_remote_code,
|
|
||||||
args.disable_detokenize,
|
|
||||||
)
|
|
||||||
elif args.backend == "mii":
|
|
||||||
elapsed_time = run_mii(
|
|
||||||
requests, args.model, args.tensor_parallel_size, args.output_len
|
|
||||||
)
|
|
||||||
elif args.backend == "vllm-chat":
|
|
||||||
elapsed_time, request_outputs = run_vllm_chat(
|
|
||||||
requests, args.n, EngineArgs.from_cli_args(args), args.disable_detokenize
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
raise ValueError(f"Unknown backend: {args.backend}")
|
|
||||||
|
|
||||||
if request_outputs:
|
|
||||||
# Note: with the vllm and vllm-chat backends,
|
|
||||||
# we have request_outputs, which we use to count tokens.
|
|
||||||
total_prompt_tokens = 0
|
|
||||||
total_output_tokens = 0
|
|
||||||
for ro in request_outputs:
|
|
||||||
if not isinstance(ro, RequestOutput):
|
|
||||||
continue
|
|
||||||
total_prompt_tokens += (
|
|
||||||
len(ro.prompt_token_ids) if ro.prompt_token_ids else 0
|
|
||||||
)
|
|
||||||
total_output_tokens += sum(len(o.token_ids) for o in ro.outputs if o)
|
|
||||||
total_num_tokens = total_prompt_tokens + total_output_tokens
|
|
||||||
else:
|
|
||||||
total_num_tokens = sum(r.prompt_len + r.expected_output_len for r in requests)
|
|
||||||
total_output_tokens = sum(r.expected_output_len for r in requests)
|
|
||||||
total_prompt_tokens = total_num_tokens - total_output_tokens
|
|
||||||
|
|
||||||
if is_multi_modal and args.backend != "vllm-chat":
|
|
||||||
print(
|
|
||||||
"\033[91mWARNING\033[0m: Multi-modal request with "
|
|
||||||
f"{args.backend} backend detected. The "
|
|
||||||
"following metrics are not accurate because image tokens are not"
|
|
||||||
" counted. See vllm-project/vllm/issues/9778 for details."
|
|
||||||
)
|
|
||||||
# TODO(vllm-project/vllm/issues/9778): Count multi-modal token length.
|
|
||||||
# vllm-chat backend counts the image tokens now
|
|
||||||
|
|
||||||
print(
|
|
||||||
f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
|
|
||||||
f"{total_num_tokens / elapsed_time:.2f} total tokens/s, "
|
|
||||||
f"{total_output_tokens / elapsed_time:.2f} output tokens/s"
|
|
||||||
)
|
|
||||||
print(f"Total num prompt tokens: {total_prompt_tokens}")
|
|
||||||
print(f"Total num output tokens: {total_output_tokens}")
|
|
||||||
|
|
||||||
# Output JSON results if specified
|
|
||||||
if args.output_json:
|
|
||||||
results = {
|
|
||||||
"elapsed_time": elapsed_time,
|
|
||||||
"num_requests": len(requests),
|
|
||||||
"total_num_tokens": total_num_tokens,
|
|
||||||
"requests_per_second": len(requests) / elapsed_time,
|
|
||||||
"tokens_per_second": total_num_tokens / elapsed_time,
|
|
||||||
}
|
|
||||||
with open(args.output_json, "w") as f:
|
|
||||||
json.dump(results, f, indent=4)
|
|
||||||
save_to_pytorch_benchmark_format(args, results)
|
|
||||||
|
|
||||||
|
|
||||||
def validate_args(args):
|
|
||||||
"""
|
|
||||||
Validate command-line arguments.
|
|
||||||
"""
|
|
||||||
|
|
||||||
# === Deprecation and Defaulting ===
|
|
||||||
if args.dataset is not None:
|
|
||||||
warnings.warn(
|
|
||||||
"The '--dataset' argument will be deprecated in the next release. "
|
|
||||||
"Please use '--dataset-name' and '--dataset-path' instead.",
|
|
||||||
stacklevel=2,
|
|
||||||
)
|
|
||||||
args.dataset_path = args.dataset
|
|
||||||
|
|
||||||
if not getattr(args, "tokenizer", None):
|
|
||||||
args.tokenizer = args.model
|
|
||||||
|
|
||||||
# === Backend Validation ===
|
|
||||||
valid_backends = {"vllm", "hf", "mii", "vllm-chat"}
|
|
||||||
if args.backend not in valid_backends:
|
|
||||||
raise ValueError(f"Unsupported backend: {args.backend}")
|
|
||||||
|
|
||||||
# === Dataset Configuration ===
|
|
||||||
if not args.dataset and not args.dataset_path:
|
|
||||||
print("When dataset path is not set, it will default to random dataset")
|
|
||||||
args.dataset_name = "random"
|
|
||||||
if args.input_len is None:
|
|
||||||
raise ValueError("input_len must be provided for a random dataset")
|
|
||||||
|
|
||||||
# === Dataset Name Specific Checks ===
|
|
||||||
# --hf-subset and --hf-split: only used
|
|
||||||
# when dataset_name is 'hf'
|
|
||||||
if args.dataset_name != "hf" and (
|
|
||||||
getattr(args, "hf_subset", None) is not None
|
|
||||||
or getattr(args, "hf_split", None) is not None
|
|
||||||
):
|
|
||||||
warnings.warn(
|
|
||||||
"--hf-subset and --hf-split will be ignored \
|
|
||||||
since --dataset-name is not 'hf'.",
|
|
||||||
stacklevel=2,
|
|
||||||
)
|
|
||||||
elif args.dataset_name == "hf":
|
|
||||||
if args.dataset_path in (
|
|
||||||
VisionArenaDataset.SUPPORTED_DATASET_PATHS.keys()
|
|
||||||
| ConversationDataset.SUPPORTED_DATASET_PATHS
|
|
||||||
):
|
|
||||||
assert args.backend == "vllm-chat", (
|
|
||||||
f"{args.dataset_path} needs to use vllm-chat as the backend."
|
|
||||||
) # noqa: E501
|
|
||||||
elif args.dataset_path in (
|
|
||||||
InstructCoderDataset.SUPPORTED_DATASET_PATHS
|
|
||||||
| AIMODataset.SUPPORTED_DATASET_PATHS
|
|
||||||
):
|
|
||||||
assert args.backend == "vllm", (
|
|
||||||
f"{args.dataset_path} needs to use vllm as the backend."
|
|
||||||
) # noqa: E501
|
|
||||||
else:
|
|
||||||
raise ValueError(f"{args.dataset_path} is not supported by hf dataset.")
|
|
||||||
|
|
||||||
# --random-range-ratio: only used when dataset_name is 'random'
|
|
||||||
if args.dataset_name != "random" and args.random_range_ratio is not None:
|
|
||||||
warnings.warn(
|
|
||||||
"--random-range-ratio will be ignored since \
|
|
||||||
--dataset-name is not 'random'.",
|
|
||||||
stacklevel=2,
|
|
||||||
)
|
|
||||||
|
|
||||||
# --prefix-len: only used when dataset_name is 'random', 'sonnet', or not
|
|
||||||
# set.
|
|
||||||
if (
|
|
||||||
args.dataset_name not in {"random", "sonnet", None}
|
|
||||||
and args.prefix_len is not None
|
|
||||||
):
|
|
||||||
warnings.warn(
|
|
||||||
"--prefix-len will be ignored since --dataset-name\
|
|
||||||
is not 'random', 'sonnet', or not set.",
|
|
||||||
stacklevel=2,
|
|
||||||
)
|
|
||||||
|
|
||||||
# === LoRA Settings ===
|
|
||||||
if getattr(args, "enable_lora", False) and args.backend != "vllm":
|
|
||||||
raise ValueError("LoRA benchmarking is only supported for vLLM backend")
|
|
||||||
if getattr(args, "enable_lora", False) and args.lora_path is None:
|
|
||||||
raise ValueError("LoRA path must be provided when enable_lora is True")
|
|
||||||
|
|
||||||
# === Backend-specific Validations ===
|
|
||||||
if args.backend == "hf" and args.hf_max_batch_size is None:
|
|
||||||
raise ValueError("HF max batch size is required for HF backend")
|
|
||||||
if args.backend != "hf" and args.hf_max_batch_size is not None:
|
|
||||||
raise ValueError("HF max batch size is only for HF backend.")
|
|
||||||
|
|
||||||
if (
|
|
||||||
args.backend in {"hf", "mii"}
|
|
||||||
and getattr(args, "quantization", None) is not None
|
|
||||||
):
|
|
||||||
raise ValueError("Quantization is only for vLLM backend.")
|
|
||||||
|
|
||||||
if args.backend == "mii" and args.dtype != "auto":
|
|
||||||
raise ValueError("dtype must be auto for MII backend.")
|
|
||||||
if args.backend == "mii" and args.n != 1:
|
|
||||||
raise ValueError("n must be 1 for MII backend.")
|
|
||||||
if args.backend == "mii" and args.tokenizer != args.model:
|
|
||||||
raise ValueError("Tokenizer must be the same as the model for MII backend.")
|
|
||||||
|
|
||||||
# --data-parallel is not supported currently.
|
|
||||||
# https://github.com/vllm-project/vllm/issues/16222
|
|
||||||
if args.data_parallel_size > 1:
|
|
||||||
raise ValueError(
|
|
||||||
"Data parallel is not supported in offline benchmark, \
|
|
||||||
please use benchmark serving instead"
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def create_argument_parser():
|
|
||||||
parser = FlexibleArgumentParser(description="Benchmark the throughput.")
|
|
||||||
parser.add_argument(
|
|
||||||
"--backend",
|
|
||||||
type=str,
|
|
||||||
choices=["vllm", "hf", "mii", "vllm-chat"],
|
|
||||||
default="vllm",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--dataset-name",
|
|
||||||
type=str,
|
|
||||||
choices=["sharegpt", "random", "sonnet", "burstgpt", "hf"],
|
|
||||||
help="Name of the dataset to benchmark on.",
|
|
||||||
default="sharegpt",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--no-stream",
|
|
||||||
action="store_true",
|
|
||||||
help="Do not load the dataset in streaming mode.",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--dataset",
|
|
||||||
type=str,
|
|
||||||
default=None,
|
|
||||||
help="Path to the ShareGPT dataset, will be deprecated in\
|
|
||||||
the next release. The dataset is expected to "
|
|
||||||
"be a json in form of list[dict[..., conversations: "
|
|
||||||
"list[dict[..., value: <prompt_or_response>]]]]",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--dataset-path", type=str, default=None, help="Path to the dataset"
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--input-len",
|
|
||||||
type=int,
|
|
||||||
default=None,
|
|
||||||
help="Input prompt length for each request",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--output-len",
|
|
||||||
type=int,
|
|
||||||
default=None,
|
|
||||||
help="Output length for each request. Overrides the "
|
|
||||||
"output length from the dataset.",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--n", type=int, default=1, help="Number of generated sequences per prompt."
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--num-prompts", type=int, default=1000, help="Number of prompts to process."
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--hf-max-batch-size",
|
|
||||||
type=int,
|
|
||||||
default=None,
|
|
||||||
help="Maximum batch size for HF backend.",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--output-json",
|
|
||||||
type=str,
|
|
||||||
default=None,
|
|
||||||
help="Path to save the throughput results in JSON format.",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--async-engine",
|
|
||||||
action="store_true",
|
|
||||||
default=False,
|
|
||||||
help="Use vLLM async engine rather than LLM class.",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--disable-frontend-multiprocessing",
|
|
||||||
action="store_true",
|
|
||||||
default=False,
|
|
||||||
help="Disable decoupled async engine frontend.",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--disable-detokenize",
|
|
||||||
action="store_true",
|
|
||||||
help=(
|
|
||||||
"Do not detokenize the response (i.e. do not include "
|
|
||||||
"detokenization time in the measurement)"
|
|
||||||
),
|
|
||||||
)
|
|
||||||
# LoRA
|
|
||||||
parser.add_argument(
|
|
||||||
"--lora-path",
|
|
||||||
type=str,
|
|
||||||
default=None,
|
|
||||||
help="Path to the LoRA adapters to use. This can be an absolute path, "
|
|
||||||
"a relative path, or a Hugging Face model identifier.",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--prefix-len",
|
|
||||||
type=int,
|
|
||||||
default=None,
|
|
||||||
help=f"Number of prefix tokens to be used in RandomDataset "
|
|
||||||
"and SonnetDataset. For RandomDataset, the total input "
|
|
||||||
"length is the sum of prefix-len (default: "
|
|
||||||
f"{RandomDataset.DEFAULT_PREFIX_LEN}) and a random context length "
|
|
||||||
"sampled from [input_len * (1 - range_ratio), "
|
|
||||||
"input_len * (1 + range_ratio)]. For SonnetDataset, "
|
|
||||||
f"prefix_len (default: {SonnetDataset.DEFAULT_PREFIX_LEN}) "
|
|
||||||
"controls how much of the input is fixed lines versus "
|
|
||||||
"random lines, but the total input length remains approximately "
|
|
||||||
"input_len tokens.",
|
|
||||||
)
|
|
||||||
# random dataset
|
|
||||||
parser.add_argument(
|
|
||||||
"--random-range-ratio",
|
|
||||||
type=float,
|
|
||||||
default=None,
|
|
||||||
help=f"Range ratio (default : {RandomDataset.DEFAULT_RANGE_RATIO}) "
|
|
||||||
"for sampling input/output length, "
|
|
||||||
"used only for RandomDataset. Must be in the range [0, 1) to "
|
|
||||||
"define a symmetric sampling range "
|
|
||||||
"[length * (1 - range_ratio), length * (1 + range_ratio)].",
|
|
||||||
)
|
|
||||||
|
|
||||||
# hf dtaset
|
|
||||||
parser.add_argument(
|
|
||||||
"--hf-subset", type=str, default=None, help="Subset of the HF dataset."
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--hf-split", type=str, default=None, help="Split of the HF dataset."
|
|
||||||
)
|
|
||||||
|
|
||||||
parser = AsyncEngineArgs.add_cli_args(parser)
|
|
||||||
|
|
||||||
return parser
|
|
||||||
|
|
||||||
|
|
||||||
 if __name__ == "__main__":
-    parser = create_argument_parser()
-    args = parser.parse_args()
-    if args.tokenizer is None:
-        args.tokenizer = args.model
-    validate_args(args)
-    main(args)
+    print("""DEPRECATED: This script has been moved to the vLLM CLI.
+
+Please use the following command instead:
+    vllm bench throughput
+
+For help with the new command, run:
+    vllm bench throughput --help
+
+Alternatively, you can run the new command directly with:
+    python -m vllm.entrypoints.cli.main bench throughput --help
+""")
+    sys.exit(1)
@@ -17,7 +17,7 @@ from weight_shapes import WEIGHT_SHAPES

 from vllm import _custom_ops as ops
 from vllm.model_executor.layers.quantization.utils.fp8_utils import (
-    w8a8_block_fp8_matmul,
+    w8a8_triton_block_scaled_mm,
 )
 from vllm.utils import FlexibleArgumentParser, cdiv
@@ -158,7 +158,7 @@ def bench_fp8(
         "cutlass_fp8_fp8_fp16_scaled_mm_bias": lambda: ops.cutlass_scaled_mm(
             a, b, scale_a, scale_b, torch.float16, bias.to(dtype=torch.float16)
         ),
-        "triton_fp8_fp8_fp16_scaled_mm_blockwise": lambda: w8a8_block_fp8_matmul(
+        "triton_fp8_fp8_fp16_scaled_mm_blockwise": lambda: w8a8_triton_block_scaled_mm(
            a_cont, b.t(), block_scale_a, block_scale_b.t(), (128, 128)
         ),
         "cutlass_fp8_fp8_fp16_scaled_mm_blockwise": lambda: ops.cutlass_scaled_mm(
@@ -55,24 +55,20 @@ benchmark() {
     output_len=$2

-    CUDA_VISIBLE_DEVICES=0 python3 \
-        -m vllm.entrypoints.openai.api_server \
-        --model $model \
+    CUDA_VISIBLE_DEVICES=0 vllm serve $model \
         --port 8100 \
         --max-model-len 10000 \
         --gpu-memory-utilization 0.6 \
         --kv-transfer-config \
-        '{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
+        '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &

-    CUDA_VISIBLE_DEVICES=1 python3 \
-        -m vllm.entrypoints.openai.api_server \
-        --model $model \
+    CUDA_VISIBLE_DEVICES=1 vllm serve $model \
         --port 8200 \
         --max-model-len 10000 \
         --gpu-memory-utilization 0.6 \
         --kv-transfer-config \
-        '{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
+        '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' &

     wait_for_server 8100
     wait_for_server 8200
@@ -38,16 +38,12 @@ wait_for_server() {
 launch_chunked_prefill() {
     model="meta-llama/Meta-Llama-3.1-8B-Instruct"
     # disagg prefill
-    CUDA_VISIBLE_DEVICES=0 python3 \
-        -m vllm.entrypoints.openai.api_server \
-        --model $model \
+    CUDA_VISIBLE_DEVICES=0 vllm serve $model \
         --port 8100 \
         --max-model-len 10000 \
         --enable-chunked-prefill \
         --gpu-memory-utilization 0.6 &
-    CUDA_VISIBLE_DEVICES=1 python3 \
-        -m vllm.entrypoints.openai.api_server \
-        --model $model \
+    CUDA_VISIBLE_DEVICES=1 vllm serve $model \
         --port 8200 \
         --max-model-len 10000 \
         --enable-chunked-prefill \
@@ -62,23 +58,19 @@ launch_chunked_prefill() {
 launch_disagg_prefill() {
     model="meta-llama/Meta-Llama-3.1-8B-Instruct"
     # disagg prefill
-    CUDA_VISIBLE_DEVICES=0 python3 \
-        -m vllm.entrypoints.openai.api_server \
-        --model $model \
+    CUDA_VISIBLE_DEVICES=0 vllm serve $model \
         --port 8100 \
         --max-model-len 10000 \
         --gpu-memory-utilization 0.6 \
         --kv-transfer-config \
-        '{"kv_connector":"PyNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
+        '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &

-    CUDA_VISIBLE_DEVICES=1 python3 \
-        -m vllm.entrypoints.openai.api_server \
-        --model $model \
+    CUDA_VISIBLE_DEVICES=1 vllm serve $model \
         --port 8200 \
         --max-model-len 10000 \
         --gpu-memory-utilization 0.6 \
         --kv-transfer-config \
-        '{"kv_connector":"PyNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
+        '{"kv_connector":"P2pNcclConnector","kv_role":"kv_consumer","kv_rank":1,"kv_parallel_size":2,"kv_buffer_size":5e9}' &

     wait_for_server 8100
     wait_for_server 8200
|||||||
benchmarks/kernels/bench_block_fp8_gemm.py (new file, 145 lines)
@@ -0,0 +1,145 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import torch

from vllm.model_executor.layers.quantization.utils.fp8_utils import (
    apply_w8a8_block_fp8_linear,
)
from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
    CUTLASS_BLOCK_FP8_SUPPORTED,
)
from vllm.platforms import current_platform
from vllm.triton_utils import triton as vllm_triton

assert current_platform.is_cuda(), (
    "Only support benchmarking w8a8 block fp8 kernel on CUDA device."
)

# DeepSeek-V3 weight shapes
DEEPSEEK_V3_SHAPES = [
    (512 + 64, 7168),
    (2112, 7168),
    ((128 + 64) * 128, 7168),
    (128 * (128 + 128), 512),
    (7168, 16384),
    (7168, 18432),
    (18432 * 2, 7168),
    (24576, 1536),
    (12288, 7168),
    (4096, 7168),
    (7168, 2048),
]


def build_w8a8_block_fp8_runner(M, N, K, block_size, device, use_cutlass):
    """Build runner function for w8a8 block fp8 matmul."""
    factor_for_scale = 1e-2

    fp8_info = torch.finfo(torch.float8_e4m3fn)
    fp8_max, fp8_min = fp8_info.max, fp8_info.min

    # Create random FP8 tensors
    A_ref = (torch.rand(M, K, dtype=torch.bfloat16, device=device) - 0.5) * 2 * fp8_max

    B_ref = (torch.rand(N, K, dtype=torch.bfloat16, device=device) - 0.5) * 2 * fp8_max
    B = B_ref.clamp(min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn)

    # Create scales
    block_n, block_k = block_size[0], block_size[1]
    n_tiles = (N + block_n - 1) // block_n
    k_tiles = (K + block_k - 1) // block_k

    Bs = (
        torch.rand(n_tiles, k_tiles, dtype=torch.float32, device=device)
        * factor_for_scale
    )

    # SM90 CUTLASS requires row-major format for scales
    if use_cutlass and current_platform.is_device_capability(90):
        Bs = Bs.T.contiguous()

    def run():
        if use_cutlass:
            return apply_w8a8_block_fp8_linear(
                A_ref, B, block_size, Bs, cutlass_block_fp8_supported=True
            )
        else:
            return apply_w8a8_block_fp8_linear(
                A_ref, B, block_size, Bs, cutlass_block_fp8_supported=False
            )

    return run


# Determine available providers
available_providers = ["torch-bf16", "w8a8-block-fp8-triton"]
plot_title = "BF16 vs W8A8 Block FP8 GEMMs"

if CUTLASS_BLOCK_FP8_SUPPORTED:
    available_providers.append("w8a8-block-fp8-cutlass")


@vllm_triton.testing.perf_report(
    vllm_triton.testing.Benchmark(
        x_names=["batch_size"],
        x_vals=[1, 16, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384],
        x_log=False,
        line_arg="provider",
        line_vals=available_providers,
        line_names=available_providers,
        ylabel="TFLOP/s (larger is better)",
        plot_name="BF16 vs W8A8 Block FP8 GEMMs",
        args={},
    )
)
def benchmark_tflops(batch_size, provider, N, K, block_size=(128, 128)):
    M = batch_size
    device = "cuda"

    quantiles = [0.5, 0.2, 0.8]

    if provider == "torch-bf16":
        a = torch.randn((M, K), device=device, dtype=torch.bfloat16)
        b = torch.randn((N, K), device=device, dtype=torch.bfloat16)
        ms, min_ms, max_ms = vllm_triton.testing.do_bench_cudagraph(
            lambda: torch.nn.functional.linear(a, b), quantiles=quantiles
        )
    elif provider == "w8a8-block-fp8-triton":
        run_w8a8_triton = build_w8a8_block_fp8_runner(
            M, N, K, block_size, device, use_cutlass=False
        )
        ms, min_ms, max_ms = vllm_triton.testing.do_bench_cudagraph(
            lambda: run_w8a8_triton(), quantiles=quantiles
        )
    elif provider == "w8a8-block-fp8-cutlass":
        run_w8a8_cutlass = build_w8a8_block_fp8_runner(
            M, N, K, block_size, device, use_cutlass=True
        )
        ms, min_ms, max_ms = vllm_triton.testing.do_bench_cudagraph(
            lambda: run_w8a8_cutlass(), quantiles=quantiles
        )
    else:
        raise ValueError(f"Unknown provider: {provider}")

    to_tflops = lambda t_ms: (2 * M * N * K) * 1e-12 / (t_ms * 1e-3)
    return to_tflops(ms), to_tflops(max_ms), to_tflops(min_ms)


if __name__ == "__main__":
    block_size = (128, 128)

    for N, K in DEEPSEEK_V3_SHAPES:
        print(f"\nBenchmarking DeepSeek-V3, N={N} K={K}")

        print(f"TFLOP/s comparison (block_size={block_size}):")
        benchmark_tflops.run(
            print_data=True,
            # show_plots=False,
            # save_path=f"bench_w8a8_block_fp8_tflops_n{N}_k{K}",
            N=N,
            K=K,
            block_size=block_size,
        )

    print("\nBenchmark finished!")
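For reference, the TFLOP/s numbers reported by benchmark_tflops above come from the usual 2*M*N*K FLOP count of a GEMM divided by the measured latency. A minimal standalone sanity check of that conversion, with made-up numbers (the shape is one of the DeepSeek-V3 shapes, the latency is hypothetical, not a measured result):

    # Illustrative only: the latency below is invented, not benchmark output.
    M, N, K = 4096, 2112, 7168      # batch 4096 at one DeepSeek-V3 shape
    t_ms = 1.0                      # hypothetical kernel latency in milliseconds
    tflops = (2 * M * N * K) * 1e-12 / (t_ms * 1e-3)
    print(f"{tflops:.1f} TFLOP/s")  # ~124.0 for these numbers
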
@@ -3,6 +3,7 @@
 import argparse
 import copy
 import itertools
+import os

 import torch
 from weight_shapes import WEIGHT_SHAPES
@@ -23,21 +24,45 @@ PROVIDER_CFGS = {
     "torch-bf16": dict(enabled=True),
     "nvfp4": dict(no_a_quant=False, enabled=True),
     "nvfp4-noquant": dict(no_a_quant=True, enabled=True),
+    "fbgemm-nvfp4": dict(fbgemm=True, no_a_quant=False, enabled=True),
+    "fbgemm-nvfp4-noquant": dict(fbgemm=True, no_a_quant=True, enabled=True),
 }

+_needs_fbgemm = any(
+    v.get("fbgemm", False) for v in PROVIDER_CFGS.values() if v.get("enabled", False)
+)
+if _needs_fbgemm:
+    try:
+        from fbgemm_gpu.experimental.gemm.triton_gemm.fp4_quantize import (
+            triton_scale_nvfp4_quant,
+        )
+    except ImportError:
+        print(
+            "WARNING: FBGEMM providers are enabled but fbgemm_gpu is not installed. "
+            "These providers will be skipped. Please install fbgemm_gpu with: "
+            "'pip install fbgemm-gpu-genai' to run them."
+        )
+        # Disable FBGEMM providers so the benchmark can run.
+        for cfg in PROVIDER_CFGS.values():
+            if cfg.get("fbgemm"):
+                cfg["enabled"] = False
+
 _enabled = [k for k, v in PROVIDER_CFGS.items() if v["enabled"]]


-def _quant_weight_nvfp4(b: torch.Tensor, device: str):
+def _quant_weight_nvfp4(b: torch.Tensor, device: str, cfg):
     # Compute global scale for weight
     b_amax = torch.abs(b).max().to(torch.float32)
     b_global_scale = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / b_amax
-    b_fp4, scale_b_fp4 = ops.scaled_fp4_quant(b, b_global_scale)
+    if "fbgemm" in cfg and cfg["fbgemm"]:
+        b_fp4, scale_b_fp4 = triton_scale_nvfp4_quant(b, b_global_scale)
+    else:
+        b_fp4, scale_b_fp4 = ops.scaled_fp4_quant(b, b_global_scale)
     return b_fp4, scale_b_fp4, b_global_scale


 def build_nvfp4_runner(cfg, a, b, dtype, device):
-    b_fp4, scale_b_fp4, b_global_scale = _quant_weight_nvfp4(b, device)
+    b_fp4, scale_b_fp4, b_global_scale = _quant_weight_nvfp4(b, device, cfg)

     # Compute global scale for activation
     # NOTE: This is generally provided ahead-of-time by the model checkpoint.
@@ -46,6 +71,35 @@ def build_nvfp4_runner(cfg, a, b, dtype, device):

     # Alpha for the GEMM operation
     alpha = 1.0 / (a_global_scale * b_global_scale)
+    if "fbgemm" in cfg and cfg["fbgemm"]:
+        if cfg["no_a_quant"]:
+            a_fp4, scale_a_fp4 = triton_scale_nvfp4_quant(a, a_global_scale)
+
+            def run():
+                return torch.ops.fbgemm.f4f4bf16(
+                    a_fp4,
+                    b_fp4,
+                    scale_a_fp4,
+                    scale_b_fp4,
+                    global_scale=alpha,
+                    use_mx=False,
+                )
+
+            return run
+        else:
+
+            def run():
+                a_fp4, scale_a_fp4 = triton_scale_nvfp4_quant(a, a_global_scale)
+                return torch.ops.fbgemm.f4f4bf16(
+                    a_fp4,
+                    b_fp4,
+                    scale_a_fp4,
+                    scale_b_fp4,
+                    global_scale=alpha,
+                    use_mx=False,
+                )
+
+            return run
+
     if cfg["no_a_quant"]:
         # Pre-quantize activation
@@ -130,10 +184,13 @@ if __name__ == "__main__":

     for K, N, model in prepare_shapes(args):
         print(f"{model}, N={N} K={K}, BF16 vs NVFP4 GEMMs TFLOP/s:")
+        save_dir = f"bench_nvfp4_res_n{N}_k{K}"
+        os.makedirs(save_dir, exist_ok=True)

         benchmark.run(
             print_data=True,
             show_plots=True,
-            save_path=f"bench_nvfp4_res_n{N}_k{K}",
+            save_path=save_dir,
             N=N,
             K=K,
         )
@@ -2,14 +2,25 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import itertools
 from typing import Callable
+from unittest.mock import patch

+import pandas as pd
 import torch

-from vllm import _custom_ops as ops
-from vllm.config import CompilationConfig, VllmConfig, set_current_vllm_config
 from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8
 from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
 from vllm.triton_utils import triton
+from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
+
+
+def with_triton_mode(fn):
+    """Temporarily force the Triton fallback path"""
+
+    def wrapped(*args, **kwargs):
+        with patch("vllm.platforms.current_platform.is_cuda", return_value=False):
+            return fn(*args, **kwargs)
+
+    return wrapped


 # TODO(luka): use standalone_compile utility
@@ -21,78 +32,238 @@ def with_dyn_arg(fn: Callable, arg_index: int, dim_index: int):
     return inner


-torch._dynamo.config.recompile_limit = 8888
-compilation_config = CompilationConfig(custom_ops=["none"])
-with set_current_vllm_config(VllmConfig(compilation_config=compilation_config)):
-    torch_per_token_quant_fp8 = torch.compile(
-        QuantFP8(False, GroupShape.PER_TOKEN),
-        fullgraph=True,
-        dynamic=False,  # recompile for different shapes
-    )
+def bench_compile(fn: Callable):
+    # recompile for different shapes
+    fwd = torch.compile(fn, fullgraph=True, dynamic=False)

     # First dim is explicitly dynamic to simulate vLLM usage
-    torch_per_token_quant_fp8 = with_dyn_arg(torch_per_token_quant_fp8, 0, 0)
+    return with_dyn_arg(fwd, 0, 0)


-def cuda_per_token_quant_fp8(
-    input: torch.Tensor,
-) -> tuple[torch.Tensor, torch.Tensor]:
-    return ops.scaled_fp8_quant(input)
+torch._dynamo.config.recompile_limit = 8888


-def calculate_diff(batch_size: int, seq_len: int):
-    """Calculate difference between Triton and CUDA implementations."""
+def calculate_diff(
+    batch_size: int,
+    hidden_size: int,
+    group_shape: GroupShape,
+    dtype: torch.dtype,
+):
+    """Calculate the difference between Inductor and CUDA implementations."""
     device = torch.device("cuda")
-    x = torch.rand((batch_size * seq_len, 4096), dtype=torch.float16, device=device)
+    x = torch.randn((batch_size, hidden_size), dtype=dtype, device=device)

-    torch_out, torch_scale = torch_per_token_quant_fp8(x)
-    cuda_out, cuda_scale = cuda_per_token_quant_fp8(x)
+    quant_fp8 = QuantFP8(False, group_shape, column_major_scales=False)

-    if torch.allclose(
-        cuda_out.to(torch.float32), torch_out.to(torch.float32), rtol=1e-3, atol=1e-5
-    ) and torch.allclose(cuda_scale, torch_scale, rtol=1e-3, atol=1e-5):
+    torch_out, torch_scale = bench_compile(quant_fp8.forward_native)(x)
+    torch_eager_out, torch_eager_scale = quant_fp8.forward_native(x)
+    cuda_out, cuda_scale = quant_fp8.forward_cuda(x)
+
+    try:
+        torch.testing.assert_close(
+            cuda_out.to(torch.float32),
+            torch_out.to(torch.float32),
+            rtol=1e-3,
+            atol=1e-5,
+        )
+        torch.testing.assert_close(cuda_scale, torch_scale, rtol=1e-3, atol=1e-5)
+        torch.testing.assert_close(
+            cuda_out.to(torch.float32),
+            torch_eager_out.to(torch.float32),
+            rtol=1e-3,
+            atol=1e-5,
+        )
+        torch.testing.assert_close(cuda_scale, torch_eager_scale, rtol=1e-3, atol=1e-5)
         print("✅ All implementations match")
-    else:
+    except AssertionError as e:
         print("❌ Implementations differ")
+        print(e)


-batch_size_range = [1, 16, 32, 64, 128]
-seq_len_range = [1, 16, 64, 128, 256, 512, 1024, 2048, 4096]
-
-configs = list(itertools.product(batch_size_range, seq_len_range))
+configs = []


-@triton.testing.perf_report(
-    triton.testing.Benchmark(
-        x_names=["batch_size", "seq_len"],
-        x_vals=configs,
-        line_arg="provider",
-        line_vals=["torch", "cuda"],
-        line_names=["Torch", "CUDA"],
-        styles=[("blue", "-"), ("green", "-")],
-        ylabel="us",
-        plot_name="per-token-dynamic-quant-fp8-performance",
-        args={},
-    )
-)
-def benchmark_quantization(batch_size, seq_len, provider):
-    dtype = torch.float16
+def benchmark_quantization(
+    batch_size,
+    hidden_size,
+    provider,
+    group_shape: GroupShape,
+    col_major: bool,
+    dtype: torch.dtype,
+):
     device = torch.device("cuda")

-    x = torch.randn(batch_size * seq_len, 4096, device=device, dtype=dtype)
+    x = torch.randn(batch_size, hidden_size, device=device, dtype=dtype)

     quantiles = [0.5, 0.2, 0.8]
+    quant_fp8 = QuantFP8(False, group_shape, column_major_scales=col_major)

     if provider == "torch":
-        fn = lambda: torch_per_token_quant_fp8(x.clone())
+        fn = lambda: bench_compile(quant_fp8.forward_native)(x.clone())
     elif provider == "cuda":
-        fn = lambda: cuda_per_token_quant_fp8(x.clone())
+        fn = lambda: quant_fp8.forward_cuda(x.clone())
+    elif provider == "triton":
+        if not group_shape.is_per_group():
+            # Triton only supported for per-group
+            return 0, 0, 0
+
+        fn = lambda: with_triton_mode(quant_fp8.forward_cuda)(x.clone())

     ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(fn, quantiles=quantiles)

     return 1000 * ms, 1000 * max_ms, 1000 * min_ms


+# TODO(luka) extract to utils
+def compute_geomean_speedups(
+    df: pd.DataFrame,
+    baseline_col: str,
+    speedup_cols: list[str],
+    groupby_cols: list[str] | None = None,
+) -> pd.DataFrame:
+    """
+    Compute geometric mean speedups over a baseline column.
+
+    Args:
+        df: Input dataframe
+        baseline_col: Column to use as baseline
+        speedup_cols: Columns to compute speedups for
+        groupby_cols: Columns to group by. If None, compute over entire df.
+
+    Returns:
+        pd.DataFrame with geometric mean speedups
+    """
+    from scipy.stats import gmean
+
+    def geo_speedup(group: pd.DataFrame) -> pd.Series:
+        ratios = {
+            col: (group[baseline_col] / group[col]).values for col in speedup_cols
+        }
+        return pd.Series({col: gmean(vals) for col, vals in ratios.items()})
+
+    if groupby_cols is None:
+        result = geo_speedup(df).to_frame().T
+    else:
+        result = (
+            df.groupby(groupby_cols)
+            .apply(geo_speedup, include_groups=False)
+            .reset_index()
+        )
+
+    return result
+
+
 if __name__ == "__main__":
-    calculate_diff(batch_size=4, seq_len=4096)
-    benchmark_quantization.run(print_data=True)
+    parser = FlexibleArgumentParser(
+        description="Benchmark the various implementations of QuantFP8 (dynamic-only)"
+    )
+    parser.add_argument("-c", "--check", action="store_true")
+    parser.add_argument(
+        "--dtype", type=str, choices=["half", "bfloat16", "float"], default="bfloat16"
+    )
+    parser.add_argument(
+        "--hidden-sizes",
+        type=int,
+        nargs="+",
+        default=[896, 1024, 2048, 4096, 7168],
+        help="Hidden sizes to benchmark",
+    )
+    parser.add_argument(
+        "--batch-sizes",
+        type=int,
+        nargs="+",
+        default=[1, 16, 128, 512, 1024],
+        help="Batch sizes to benchmark",
+    )
+    parser.add_argument(
+        "--group-sizes",
+        type=int,
+        nargs="+",
+        default=None,
+        help="Group sizes for GroupShape(1,N) to benchmark. "
+        "Use 0 for PER_TENSOR, -1 for PER_TOKEN (default: 0,-1,64,128)",
+    )
+    parser.add_argument(
+        "--no-column-major",
+        action="store_true",
+        help="Disable column-major scales testing",
+    )
+
+    args = parser.parse_args()
+    assert args
+
+    dtype = STR_DTYPE_TO_TORCH_DTYPE[args.dtype]
+
+    hidden_sizes = args.hidden_sizes
+    batch_sizes = args.batch_sizes
+
+    if args.group_sizes is not None:
+        group_shapes = []
+        for size in args.group_sizes:
+            if size == 0:
+                group_shapes.append(GroupShape.PER_TENSOR)
+            elif size == -1:
+                group_shapes.append(GroupShape.PER_TOKEN)
+            else:
+                group_shapes.append(GroupShape(1, size))
+    else:
+        group_shapes = [
+            GroupShape.PER_TENSOR,
+            GroupShape.PER_TOKEN,
+            GroupShape(1, 64),
+            GroupShape(1, 128),
+        ]
+
+    column_major_scales = [False] if args.no_column_major else [True, False]
+
+    config_gen = itertools.product(
+        group_shapes,
+        column_major_scales,
+        batch_sizes,
+        hidden_sizes,
+    )
+
+    # filter out column-major scales for non-group, reverse order
+    configs.extend(c[::-1] for c in config_gen if (c[0].is_per_group() or not c[1]))
+
+    print(f"Running {len(configs)} configurations:")
+    print(f"  Hidden sizes: {hidden_sizes}")
+    print(f"  Batch sizes: {batch_sizes}")
+    print(f"  Group shapes: {[str(g) for g in group_shapes]}")
+    print(f"  Column major scales: {column_major_scales}")
+    print()
+
+    if args.check:
+        for group_shape in group_shapes:
+            group_size = group_shape[1]
+            print(f"{group_size=}")
+            calculate_diff(
+                batch_size=4, hidden_size=4096, group_shape=group_shape, dtype=dtype
+            )
+
+    benchmark = triton.testing.perf_report(
+        triton.testing.Benchmark(
+            x_names=["hidden_size", "batch_size", "col_major", "group_shape"],
+            x_vals=configs,
+            line_arg="provider",
+            line_vals=["torch", "cuda", "triton"],
+            line_names=["Torch (Compiled)", "CUDA", "Triton"],
+            styles=[("blue", "-"), ("green", "-"), ("black", "-")],
+            ylabel="us",
+            plot_name="QuantFP8 performance",
+            args={},
+        )
+    )(benchmark_quantization)
+
+    df = benchmark.run(print_data=True, dtype=dtype, return_df=True)
+
+    # Print geomean speedups
+    geo_table_grouped = compute_geomean_speedups(
+        df,
+        baseline_col="Torch (Compiled)",
+        speedup_cols=["CUDA", "Triton"],
+        groupby_cols=["col_major", "group_shape"],
+    )
+
+    print("Speedup over Torch (Compiled)")
+    print(geo_table_grouped.to_string(index=False))
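To make the geomean summary added above concrete, here is a small usage sketch of compute_geomean_speedups on a hand-written table. The timings are invented and the column names simply mirror the line_names used by the benchmark; this assumes the helper (and pandas/scipy) are importable in the current scope:

    import pandas as pd

    # Toy per-config timings in microseconds, one column per provider.
    df = pd.DataFrame(
        {
            "group_shape": ["PER_TOKEN", "PER_TOKEN", "GroupShape(1, 128)"],
            "Torch (Compiled)": [10.0, 20.0, 40.0],
            "CUDA": [5.0, 10.0, 20.0],
            "Triton": [8.0, 16.0, 40.0],
        }
    )
    geo = compute_geomean_speedups(
        df,
        baseline_col="Torch (Compiled)",
        speedup_cols=["CUDA", "Triton"],
        groupby_cols=["group_shape"],
    )
    # CUDA shows a 2.0x geometric-mean speedup in both groups,
    # Triton 1.25x for PER_TOKEN and 1.0x for GroupShape(1, 128).
    print(geo.to_string(index=False))
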
benchmarks/kernels/benchmark_activation.py (new file, 104 lines)
@@ -0,0 +1,104 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

# benchmark custom activation op performance
import itertools

import torch

import vllm.model_executor.layers.activation  # noqa F401
from vllm.model_executor.custom_op import CustomOp
from vllm.platforms import current_platform
from vllm.triton_utils import triton
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser

batch_size_range = [1, 16, 32, 64, 128]
seq_len_range = [1, 16, 64, 128, 256, 512, 1024, 2048, 4096]
intermediate_size = [3072, 9728, 12288]
configs = list(itertools.product(batch_size_range, seq_len_range, intermediate_size))


def benchmark_activation(
    batch_size: int,
    seq_len: int,
    intermediate_size: int,
    provider: str,
    func_name: str,
    dtype: torch.dtype,
):
    device = "cuda"
    num_tokens = batch_size * seq_len
    dim = intermediate_size
    current_platform.seed_everything(42)
    torch.set_default_device(device)

    if func_name == "gelu_and_mul":
        layer = CustomOp.op_registry[func_name](approximate="none")
    elif func_name == "gelu_and_mul_tanh":
        layer = CustomOp.op_registry["gelu_and_mul"](approximate="tanh")
    elif func_name == "fatrelu_and_mul":
        threshold = 0.5
        layer = CustomOp.op_registry[func_name](threshold)
    else:
        layer = CustomOp.op_registry[func_name]()

    x = torch.randn(num_tokens, dim, dtype=dtype, device=device)
    compiled_layer = torch.compile(layer.forward_native)

    if provider == "custom":
        fn = lambda: layer(x)
    elif provider == "compiled":
        fn = lambda: compiled_layer(x)

    ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
        fn, quantiles=[0.5, 0.2, 0.8]
    )
    return ms, max_ms, min_ms


if __name__ == "__main__":
    parser = FlexibleArgumentParser(description="Benchmark the custom activation op.")
    parser.add_argument(
        "--func-name",
        type=str,
        choices=[
            "mul_and_silu",
            "silu_and_mul",
            "gelu_and_mul",
            "gelu_and_mul_tanh",
            "fatrelu_and_mul",
            "swigluoai_and_mul",
            "gelu_new",
            "gelu_fast",
            "quick_gelu",
        ],
        default="silu_and_mul",
    )
    parser.add_argument(
        "--dtype", type=str, choices=["half", "bfloat16", "float"], default="bfloat16"
    )
    args = parser.parse_args()
    assert args

    func_name = args.func_name
    dtype = STR_DTYPE_TO_TORCH_DTYPE[args.dtype]

    perf_report = triton.testing.perf_report(
        triton.testing.Benchmark(
            x_names=["batch_size", "seq_len", "intermediate_size"],
            x_vals=configs,
            line_arg="provider",
            line_vals=["custom", "compiled"],
            line_names=["Custom OP", "Compiled"],
            styles=[("blue", "-"), ("green", "-")],
            ylabel="ms",
            plot_name=f"{func_name}-op-performance",
            args={},
        )
    )

    perf_report(
        lambda batch_size, seq_len, intermediate_size, provider: benchmark_activation(
            batch_size, seq_len, intermediate_size, provider, func_name, dtype
        )
    ).run(print_data=True)
@@ -13,6 +13,10 @@ import torch.utils.benchmark as benchmark

 from vllm import _custom_ops as ops
 from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
+from vllm.model_executor.layers.fused_moe.config import (
+    fp8_w8a8_moe_quant_config,
+    nvfp4_moe_quant_config,
+)
 from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp4
 from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk
 from vllm.scalar_type import scalar_types
@@ -140,6 +144,12 @@ def bench_run(
         a_fp8_scale: torch.Tensor,
         num_repeats: int,
     ):
+        quant_config = fp8_w8a8_moe_quant_config(
+            w1_scale=w1_scale,
+            w2_scale=w2_scale,
+            a1_scale=a_fp8_scale,
+        )
+
         for _ in range(num_repeats):
             fused_experts(
                 a,
@@ -147,10 +157,7 @@ def bench_run(
                 w2,
                 topk_weights,
                 topk_ids,
-                use_fp8_w8a8=True,
-                w1_scale=w1_scale,
-                w2_scale=w2_scale,
-                a1_scale=a_fp8_scale,
+                quant_config=quant_config,
             )

     def run_cutlass_moe_fp4(
@@ -172,25 +179,27 @@ def bench_run(
         device: torch.device,
         num_repeats: int,
     ):
+        quant_config = nvfp4_moe_quant_config(
+            a1_gscale=a1_gs,
+            a2_gscale=a2_gs,
+            w1_scale=w1_blockscale,
+            w2_scale=w2_blockscale,
+            g1_alphas=w1_gs,
+            g2_alphas=w2_gs,
+        )
         for _ in range(num_repeats):
             with nvtx.annotate("cutlass_moe_fp4", color="green"):
                 cutlass_moe_fp4(
                     a=a,
-                    a1_gscale=a1_gs,
-                    a2_gscale=a2_gs,
                     w1_fp4=w1_fp4,
-                    w1_blockscale=w1_blockscale,
-                    w1_alphas=w1_gs,
                     w2_fp4=w2_fp4,
-                    w2_blockscale=w2_blockscale,
-                    w2_alphas=w2_gs,
                     topk_weights=topk_weights,
                     topk_ids=topk_ids,
                     m=m,
                     n=n,
                     k=k,
                     e=num_experts,
-                    device=device,
+                    quant_config=quant_config,
                 )

     def run_cutlass_from_graph(
@@ -211,26 +220,29 @@ def bench_run(
         e: int,
         device: torch.device,
     ):
+        quant_config = nvfp4_moe_quant_config(
+            a1_gscale=a1_gs,
+            a2_gscale=a2_gs,
+            w1_scale=w1_blockscale,
+            w2_scale=w2_blockscale,
+            g1_alphas=w1_gs,
+            g2_alphas=w2_gs,
+        )
+
         with set_current_vllm_config(
             VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
         ):
             return cutlass_moe_fp4(
                 a=a,
-                a1_gscale=a1_gs,
                 w1_fp4=w1_fp4,
-                w1_blockscale=w1_blockscale,
-                w1_alphas=w1_alphas,
-                a2_gscale=a2_gs,
                 w2_fp4=w2_fp4,
-                w2_blockscale=w2_blockscale,
-                w2_alphas=w2_alphas,
                 topk_weights=topk_weights,
                 topk_ids=topk_ids,
                 m=m,
                 n=n,
                 k=k,
                 e=num_experts,
-                device=device,
+                quant_config=quant_config,
             )

     def run_triton_from_graph(
@@ -246,16 +258,18 @@ def bench_run(
         with set_current_vllm_config(
             VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
         ):
+            quant_config = fp8_w8a8_moe_quant_config(
+                w1_scale=w1_scale,
+                w2_scale=w2_scale,
+                a1_scale=a_fp8_scale,
+            )
             return fused_experts(
                 a,
                 w1,
                 w2,
                 topk_weights,
                 topk_ids,
-                use_fp8_w8a8=True,
-                w1_scale=w1_scale,
-                w2_scale=w2_scale,
-                a1_scale=a_fp8_scale,
+                quant_config=quant_config,
             )

     def replay_graph(graph, num_repeats):
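Stripped of the benchmark scaffolding, the migration in the hunks above is a change in how quantization parameters reach fused_experts: instead of use_fp8_w8a8=True plus individual scale arguments, the scales are bundled into a config object built once up front. A minimal before/after sketch using the argument names taken from the diff (the tensors themselves are placeholders):

    from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config
    from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts

    # Before: fused_experts(a, w1, w2, topk_weights, topk_ids,
    #                       use_fp8_w8a8=True,
    #                       w1_scale=w1_scale, w2_scale=w2_scale, a1_scale=a_fp8_scale)

    # After: the scales are carried by a quant config object.
    quant_config = fp8_w8a8_moe_quant_config(
        w1_scale=w1_scale,
        w2_scale=w2_scale,
        a1_scale=a_fp8_scale,
    )
    fused_experts(a, w1, w2, topk_weights, topk_ids, quant_config=quant_config)
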
benchmarks/kernels/benchmark_cutlass_moe_fp8.py (new file, 406 lines)
@@ -0,0 +1,406 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Benchmark the performance of the cutlass_moe_fp8 kernel vs the triton_moe
kernel. Both kernels take in fp8 quantized weights and 16-bit activations,
but use different quantization strategies and backends.
"""

import nvtx
import torch

from vllm import _custom_ops as ops
from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config
from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp8
from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk
from vllm.platforms import current_platform
from vllm.utils import FlexibleArgumentParser

# Weight shapes for different models: [num_experts, topk, hidden_size,
# intermediate_size]
WEIGHT_SHAPES_MOE = {
    "mixtral-8x7b": [
        [8, 2, 4096, 14336],
    ],
    "deepseek-v2": [
        [160, 6, 5120, 12288],
    ],
    "custom-small": [
        [8, 2, 2048, 7168],
    ],
    "glm45-fp8": [
        [128, 8, 4096, 1408],
    ],
    "Llama-4-Maverick-17B-128E-Instruct-FP8": [
        [128, 1, 5120, 8192],
    ],
}

DEFAULT_MODELS = [
    "mixtral-8x7b",
]

DEFAULT_BATCH_SIZES = [4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048]
DEFAULT_TP_SIZES = [1]

PER_ACT_TOKEN_OPTS = [False, True]
PER_OUT_CH_OPTS = [False, True]

FP8_DTYPE = current_platform.fp8_dtype()


def bench_run(
    results: list,
    model: str,
    num_experts: int,
    topk: int,
    per_act_token: bool,
    per_out_ch: bool,
    mkn: tuple[int, int, int],
):
    (m, k, n) = mkn

    dtype = torch.half
    device = "cuda"

    # Create input activations
    a = torch.randn((m, k), device=device, dtype=dtype) / 10

    # Create weights
    w1 = torch.randn((num_experts, 2 * n, k), device=device, dtype=dtype) / 10
    w2 = torch.randn((num_experts, k, n), device=device, dtype=dtype) / 10

    # Create FP8 quantized weights and scales for both kernels
    w1_fp8q = torch.empty((num_experts, 2 * n, k), device=device, dtype=FP8_DTYPE)
    w2_fp8q = torch.empty((num_experts, k, n), device=device, dtype=FP8_DTYPE)

    # Create scales based on quantization strategy
    if per_out_ch:
        # Per-channel quantization
        w1_scale = torch.empty(
            (num_experts, 2 * n, 1), device=device, dtype=torch.float32
        )
        w2_scale = torch.empty((num_experts, k, 1), device=device, dtype=torch.float32)
    else:
        # Per-tensor quantization
        w1_scale = torch.empty((num_experts, 1, 1), device=device, dtype=torch.float32)
        w2_scale = torch.empty((num_experts, 1, 1), device=device, dtype=torch.float32)

    # Quantize weights
    for expert in range(num_experts):
        if per_out_ch:
            # Per-channel quantization - not yet implemented properly
            # For now, fall back to per-tensor quantization
            w1_fp8q[expert], w1_scale_temp = ops.scaled_fp8_quant(w1[expert])
            w2_fp8q[expert], w2_scale_temp = ops.scaled_fp8_quant(w2[expert])
            # Expand scalar scales to the expected per-channel shape
            w1_scale[expert] = w1_scale_temp.expand(2 * n, 1)
            w2_scale[expert] = w2_scale_temp.expand(k, 1)
        else:
            # Per-tensor quantization
            w1_fp8q[expert], w1_scale_temp = ops.scaled_fp8_quant(w1[expert])
            w2_fp8q[expert], w2_scale_temp = ops.scaled_fp8_quant(w2[expert])
            # Store scalar scales in [1, 1] tensors
            w1_scale[expert, 0, 0] = w1_scale_temp
            w2_scale[expert, 0, 0] = w2_scale_temp

    # Prepare weights for CUTLASS (no transpose needed)
    w1_fp8q_cutlass = w1_fp8q  # Keep original [E, 2N, K]
    w2_fp8q_cutlass = w2_fp8q  # Keep original [E, K, N]

    # Create router scores and get topk
    score = torch.randn((m, num_experts), device=device, dtype=dtype)
    topk_weights, topk_ids, _ = fused_topk(a, score, topk, renormalize=False)

    # WORKAROUND: CUTLASS MoE FP8 has issues with per-token quantization
    # Force per-tensor quantization for all cases to match working e2e setup
    a1_scale = torch.full((), 1e-2, device=device, dtype=torch.float32)
    a2_scale = torch.full((), 1e-2, device=device, dtype=torch.float32)

    # Force per-tensor quantization for all cases
    per_act_token = False

    # Create stride tensors for CUTLASS
    ab_strides1 = torch.full((num_experts,), k, dtype=torch.int64, device=device)
    ab_strides2 = torch.full((num_experts,), n, dtype=torch.int64, device=device)
    c_strides1 = torch.full((num_experts,), 2 * n, dtype=torch.int64, device=device)
    c_strides2 = torch.full((num_experts,), k, dtype=torch.int64, device=device)

    def run_triton_moe(
        a: torch.Tensor,
        w1: torch.Tensor,
        w2: torch.Tensor,
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
        w1_scale: torch.Tensor,
        w2_scale: torch.Tensor,
        a1_scale: torch.Tensor,
        a2_scale: torch.Tensor,
        num_repeats: int,
    ):
        quant_config = fp8_w8a8_moe_quant_config(
            w1_scale=w1_scale,
            w2_scale=w2_scale,
            a1_scale=a1_scale,
            a2_scale=a2_scale,
            per_act_token_quant=per_act_token,
            per_out_ch_quant=per_out_ch,
        )

        for _ in range(num_repeats):
            fused_experts(
                a,
                w1,
                w2,
                topk_weights,
                topk_ids,
                quant_config=quant_config,
            )

    def run_cutlass_moe_fp8(
        a: torch.Tensor,
        w1: torch.Tensor,
        w2: torch.Tensor,
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
        ab_strides1: torch.Tensor,
        ab_strides2: torch.Tensor,
        c_strides1: torch.Tensor,
        c_strides2: torch.Tensor,
        w1_scale: torch.Tensor,
        w2_scale: torch.Tensor,
        a1_scale: torch.Tensor,
        a2_scale: torch.Tensor,
        num_repeats: int,
    ):
        quant_config = fp8_w8a8_moe_quant_config(
            w1_scale=w1_scale,
            w2_scale=w2_scale,
            a1_scale=a1_scale,
            a2_scale=a2_scale,
            per_act_token_quant=per_act_token,
            per_out_ch_quant=per_out_ch,
        )

        for _ in range(num_repeats):
            with nvtx.annotate("cutlass_moe_fp8", color="blue"):
                cutlass_moe_fp8(
                    a=a,
                    w1_q=w1,
                    w2_q=w2,
                    topk_weights=topk_weights,
                    topk_ids=topk_ids,
                    ab_strides1=ab_strides1,
                    ab_strides2=ab_strides2,
                    c_strides1=c_strides1,
                    c_strides2=c_strides2,
                    quant_config=quant_config,
                    activation="silu",
                    global_num_experts=num_experts,
                )

    # Pre-create quantization config to avoid creating it inside CUDA graph
    quant_config = fp8_w8a8_moe_quant_config(
        w1_scale=w1_scale,
        w2_scale=w2_scale,
        a1_scale=a1_scale,
        a2_scale=a2_scale,
        per_act_token_quant=per_act_token,
        per_out_ch_quant=per_out_ch,
    )

    # Create CUDA graphs for CUTLASS (match benchmark_moe.py pattern exactly)
    cutlass_stream = torch.cuda.Stream()
    cutlass_graph = torch.cuda.CUDAGraph()
    with torch.cuda.graph(cutlass_graph, stream=cutlass_stream):
        # Capture 10 invocations like benchmark_moe.py
        for _ in range(10):
            cutlass_moe_fp8(
                a=a,
                w1_q=w1_fp8q_cutlass,
                w2_q=w2_fp8q_cutlass,
                topk_weights=topk_weights,
                topk_ids=topk_ids,
                ab_strides1=ab_strides1,
                ab_strides2=ab_strides2,
                c_strides1=c_strides1,
                c_strides2=c_strides2,
                quant_config=quant_config,
                activation="silu",
                global_num_experts=num_experts,
            )
    torch.cuda.synchronize()

    # Create CUDA graphs for Triton (match benchmark_moe.py pattern exactly)
    triton_stream = torch.cuda.Stream()
    triton_graph = torch.cuda.CUDAGraph()
    with torch.cuda.graph(triton_graph, stream=triton_stream):
        # Capture 10 invocations like benchmark_moe.py
        for _ in range(10):
            fused_experts(
                a,
                w1_fp8q,
                w2_fp8q,
                topk_weights,
                topk_ids,
                quant_config=quant_config,
            )
    torch.cuda.synchronize()

    def bench_cuda_graph(graph, num_warmup=5, num_iters=100):
        """Benchmark CUDA graph using events like benchmark_moe.py"""
        # Warmup
        for _ in range(num_warmup):
            graph.replay()
        torch.cuda.synchronize()

        # Timing
        start_event = torch.cuda.Event(enable_timing=True)
        end_event = torch.cuda.Event(enable_timing=True)

        latencies = []
        for _ in range(num_iters):
            torch.cuda.synchronize()
            start_event.record()
            graph.replay()
            end_event.record()
            end_event.synchronize()
            latencies.append(start_event.elapsed_time(end_event))

        # Divide by 10 since graph contains 10 calls
        return sum(latencies) / (num_iters * 10)

    # Benchmark parameters
    num_warmup = 5
    num_iters = 100

    # Benchmark only CUDA graphs (more reliable and faster)
    # Benchmark Triton MoE with CUDA graphs
    triton_graph_time = bench_cuda_graph(
        triton_graph, num_warmup=num_warmup, num_iters=num_iters
    )

    # Benchmark CUTLASS MoE with CUDA graphs
    cutlass_graph_time = bench_cuda_graph(
        cutlass_graph, num_warmup=num_warmup, num_iters=num_iters
    )

    # Convert ms to us and return results
    triton_time_us = triton_graph_time * 1000
    cutlass_time_us = cutlass_graph_time * 1000

    return {
        "batch_size": m,
        "triton_time_us": triton_time_us,
        "cutlass_time_us": cutlass_time_us,
    }


def main(args):
    print("Benchmarking models:")
    for i, model in enumerate(args.models):
        print(f"[{i}] {model}")

    all_results = []

    for model in args.models:
        for tp in args.tp_sizes:
            for layer in WEIGHT_SHAPES_MOE[model]:
                num_experts = layer[0]
                topk = layer[1]
                size_k = layer[2]
                size_n = layer[3] // tp

                if len(args.limit_k) > 0 and size_k not in args.limit_k:
                    continue

                if len(args.limit_n) > 0 and size_n not in args.limit_n:
                    continue

                for per_act_token in args.per_act_token_opts:
                    for per_out_ch in args.per_out_ch_opts:
                        print(
                            f"\n=== {model}, experts={num_experts}, topk={topk},"
                            f"per_act={per_act_token}, per_out_ch={per_out_ch} ==="
                        )

                        config_results = []
                        for size_m in args.batch_sizes:
                            mkn = (size_m, size_k, size_n)
                            result = bench_run(
                                [],  # Not used anymore
                                model,
                                num_experts,
                                topk,
                                per_act_token,
                                per_out_ch,
                                mkn,
                            )
                            if result:
                                config_results.append(result)

                        # Print results table for this configuration
                        if config_results:
                            print(
                                f"\n{'Batch Size':<12}"
                                f"{'Triton (us)':<15}"
                                f"{'CUTLASS (us)':<15}"
                            )
                            print("-" * 45)
                            for result in config_results:
                                print(
                                    f"{result['batch_size']:<12}"
                                    f"{result['triton_time_us']:<15.2f}"
                                    f"{result['cutlass_time_us']:<15.2f}"
                                )

                        all_results.extend(config_results)

    print(f"\nTotal benchmarks completed: {len(all_results)}")


if __name__ == "__main__":
    parser = FlexibleArgumentParser(
        description="""Benchmark CUTLASS FP8 MOE vs Triton FP8 FUSED MOE
        across specified models/shapes/batches

        Example usage:
        python benchmark_cutlass_moe_fp8.py \
            --model "Llama-4-Maverick-17B-128E-Instruct-FP8" \
            --tp-sizes 8 \
            --batch-size 2 4 8 \
            --per-act-token-opts false \
            --per-out-ch-opts false

        """
    )
    parser.add_argument(
        "--models",
        nargs="+",
        type=str,
        default=DEFAULT_MODELS,
        choices=WEIGHT_SHAPES_MOE.keys(),
    )
    parser.add_argument("--tp-sizes", nargs="+", type=int, default=DEFAULT_TP_SIZES)
    parser.add_argument(
        "--batch-sizes", nargs="+", type=int, default=DEFAULT_BATCH_SIZES
    )
    parser.add_argument("--limit-k", nargs="+", type=int, default=[])
    parser.add_argument("--limit-n", nargs="+", type=int, default=[])
    parser.add_argument(
        "--per-act-token-opts",
        nargs="+",
        type=lambda x: x.lower() == "true",
        default=[False, True],
        help="Per-activation token quantization options (true/false)",
    )
    parser.add_argument(
        "--per-out-ch-opts",
        nargs="+",
        type=lambda x: x.lower() == "true",
        default=[False, True],
        help="Per-output channel quantization options (true/false)",
    )

    args = parser.parse_args()
    main(args)
benchmarks/kernels/benchmark_device_communicators.py (new file, 508 lines)
@@ -0,0 +1,508 @@
#!/usr/bin/env python3
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

"""
Benchmark script for device communicators:
CustomAllreduce (oneshot, twoshot), PyNcclCommunicator,
and SymmMemCommunicator (multimem, two-shot).

for NCCL symmetric memory you need to set the environment variables
NCCL_NVLS_ENABLE=1 NCCL_CUMEM_ENABLE=1 VLLM_USE_NCCL_SYMM_MEM=1, otherwise NCCL does
not use fast NVLS implementation for all reduce.

Usage:
    torchrun --nproc_per_node=<N> benchmark_device_communicators.py [options]

Example:
    torchrun --nproc_per_node=2 benchmark_device_communicators.py
    --sequence-lengths 512 1024 2048 --num-warmup 10 --num-trials 100
"""

import json
import os
import time
from contextlib import nullcontext
from typing import Callable, Optional

import torch
import torch.distributed as dist
from torch.distributed import ProcessGroup

from vllm.distributed.device_communicators.custom_all_reduce import CustomAllreduce
from vllm.distributed.device_communicators.pynccl import (
    PyNcclCommunicator,
    register_nccl_symmetric_ops,
)
from vllm.distributed.device_communicators.pynccl_allocator import (
    set_graph_pool_id,
)
from vllm.distributed.device_communicators.symm_mem import SymmMemCommunicator
from vllm.logger import init_logger
from vllm.utils import FlexibleArgumentParser

logger = init_logger(__name__)

# Default sequence lengths to benchmark
DEFAULT_SEQUENCE_LENGTHS = [128, 512, 1024, 2048, 4096, 8192]

# Fixed hidden size and dtype for all benchmarks
HIDDEN_SIZE = 8192
BENCHMARK_DTYPE = torch.bfloat16

# CUDA graph settings
CUDA_GRAPH_CAPTURE_CYCLES = 10


class CommunicatorBenchmark:
    """Benchmark class for testing device communicators."""

    def __init__(
        self,
        rank: int,
        world_size: int,
        device: torch.device,
        cpu_group: ProcessGroup,
        sequence_lengths: list[int],
    ):
        self.rank = rank
        self.world_size = world_size
        self.device = device
        self.cpu_group = cpu_group

        # Calculate max_size_override based on largest sequence length
        max_seq_len = max(sequence_lengths)
        max_tensor_elements = max_seq_len * HIDDEN_SIZE
        self.max_size_override = max_tensor_elements * BENCHMARK_DTYPE.itemsize + 1

        # Initialize communicators
        self.custom_allreduce = None
        self.pynccl_comm = None
        self.symm_mem_comm = None
        self.symm_mem_comm_multimem = None
        self.symm_mem_comm_two_shot = None

        self._init_communicators()

    def _init_communicators(self):
        """Initialize all available communicators."""
        try:
            self.custom_allreduce = CustomAllreduce(
                group=self.cpu_group,
                device=self.device,
                max_size=self.max_size_override,
            )
            if not self.custom_allreduce.disabled:
                logger.info("Rank %s: CustomAllreduce initialized", self.rank)
            else:
                logger.info("Rank %s: CustomAllreduce disabled", self.rank)
        except Exception as e:
            logger.warning(
                "Rank %s: Failed to initialize CustomAllreduce: %s", self.rank, e
            )
            self.custom_allreduce = None

        try:
            self.pynccl_comm = PyNcclCommunicator(
                group=self.cpu_group, device=self.device
            )
            if not self.pynccl_comm.disabled:
                logger.info("Rank %s: PyNcclCommunicator initialized", self.rank)
                register_nccl_symmetric_ops(self.pynccl_comm)
            else:
                logger.info("Rank %s: PyNcclCommunicator disabled", self.rank)
                self.pynccl_comm = None
        except Exception as e:
            logger.warning(
                "Rank %s: Failed to initialize PyNcclCommunicator: %s", self.rank, e
            )
            self.pynccl_comm = None

        # Initialize variants for SymmMemCommunicator
        try:
            self.symm_mem_comm_multimem = SymmMemCommunicator(
                group=self.cpu_group,
                device=self.device,
                force_multimem=True,
                max_size_override=self.max_size_override,
            )
            if not self.symm_mem_comm_multimem.disabled:
                logger.info(
                    "Rank %s: SymmMemCommunicator (multimem) initialized", self.rank
                )
            else:
                self.symm_mem_comm_multimem = None
        except Exception as e:
            logger.warning(
                "Rank %s: Failed to initialize SymmMemCommunicator (multimem): %s",
                self.rank,
                e,
            )
            self.symm_mem_comm_multimem = None

        try:
            self.symm_mem_comm_two_shot = SymmMemCommunicator(
                group=self.cpu_group,
                device=self.device,
                force_multimem=False,
                max_size_override=self.max_size_override,
            )
            if not self.symm_mem_comm_two_shot.disabled:
                logger.info(
                    "Rank %s: SymmMemCommunicator (two_shot) initialized", self.rank
                )
            else:
                self.symm_mem_comm_two_shot = None
        except Exception as e:
            logger.warning(
                "Rank %s: Failed to initialize SymmMemCommunicator (two_shot): %s",
                self.rank,
                e,
            )
            self.symm_mem_comm_two_shot = None

    def benchmark_allreduce(
        self, sequence_length: int, num_warmup: int, num_trials: int
    ) -> dict[str, float]:
        """Benchmark allreduce operations for all available communicators."""

        results = {}

        # Define communicators with their benchmark functions
        communicators = []

        if self.custom_allreduce is not None:
            comm = self.custom_allreduce
            # CustomAllreduce one-shot
            communicators.append(
                (
                    "ca_1stage",
                    lambda t, c=comm: c.custom_all_reduce(t),
                    lambda t, c=comm: c.should_custom_ar(t),
                    comm.capture(),
                    "1stage",  # env variable value
                )
            )
            # CustomAllreduce two-shot
            communicators.append(
                (
                    "ca_2stage",
                    lambda t, c=comm: c.custom_all_reduce(t),
                    lambda t, c=comm: c.should_custom_ar(t),
                    comm.capture(),
                    "2stage",  # env variable value
                )
            )

        if self.pynccl_comm is not None:
            comm = self.pynccl_comm
            communicators.append(
                (
                    "pynccl",
                    lambda t, c=comm: c.all_reduce(t),
                    lambda t: True,  # Always available if initialized
                    nullcontext(),
                    None,  # no env variable needed
                )
            )
            communicators.append(
                (
                    "pynccl-symm",
                    lambda t: torch.ops.vllm.all_reduce_symmetric_with_copy(t),
                    lambda t: True,  # Always available if initialized
                    nullcontext(),
                    None,  # no env variable needed
                )
            )

        if self.symm_mem_comm_multimem is not None:
            comm = self.symm_mem_comm_multimem
            communicators.append(
                (
                    "symm_mem_multimem",
                    lambda t, c=comm: c.all_reduce(t),
                    lambda t, c=comm: c.should_use_symm_mem(t),
                    nullcontext(),
                    None,  # no env variable needed
                )
            )

        if self.symm_mem_comm_two_shot is not None:
            comm = self.symm_mem_comm_two_shot
            communicators.append(
                (
                    "symm_mem_two_shot",
                    lambda t, c=comm: c.all_reduce(t),
                    lambda t, c=comm: c.should_use_symm_mem(t),
                    nullcontext(),
                    None,  # no env variable needed
                )
            )

        # Benchmark each communicator
        for name, allreduce_fn, should_use_fn, context, env_var in communicators:
            # Set environment variable if needed
            if env_var is not None:
                os.environ["VLLM_CUSTOM_ALLREDUCE_ALGO"] = env_var
            else:
                # Clear the environment variable to avoid interference
                os.environ.pop("VLLM_CUSTOM_ALLREDUCE_ALGO", None)

            latency = self.benchmark_allreduce_single(
                sequence_length,
                allreduce_fn,
                should_use_fn,
                context,
                num_warmup,
                num_trials,
            )
            if latency is not None:
                results[name] = latency

        return results

    def benchmark_allreduce_single(
        self,
        sequence_length: int,
        allreduce_fn: Callable[[torch.Tensor], Optional[torch.Tensor]],
        should_use_fn: Callable[[torch.Tensor], bool],
        context,
        num_warmup: int,
        num_trials: int,
    ) -> Optional[float]:
        """Benchmark method with CUDA graph optimization."""
        try:
            # Create test tensor (2D: sequence_length x hidden_size)
            tensor = torch.randn(
                sequence_length, HIDDEN_SIZE, dtype=BENCHMARK_DTYPE, device=self.device
            )
            if not should_use_fn(tensor):
                return None

            torch.cuda.synchronize()
            stream = torch.cuda.Stream()
            with torch.cuda.stream(stream):
                graph_input = tensor.clone()

                # Warmup before capture
                for _ in range(3):
                    allreduce_fn(graph_input)

                # Capture the graph using context manager
                with context:
                    graph = torch.cuda.CUDAGraph()
                    graph_pool = torch.cuda.graph_pool_handle()
                    set_graph_pool_id(graph_pool)
                    with torch.cuda.graph(graph, pool=graph_pool):
                        for _ in range(CUDA_GRAPH_CAPTURE_CYCLES):
                            allreduce_fn(graph_input)

            torch.cuda.synchronize()
            for _ in range(num_warmup):
                graph.replay()
            torch.cuda.synchronize()

            torch.cuda.synchronize()
            start_time = time.perf_counter()

            for _ in range(num_trials):
                graph.replay()
            torch.cuda.synchronize()

            end_time = time.perf_counter()

            # Convert to ms and divide by CUDA_GRAPH_CAPTURE_CYCLES
            return (
                (end_time - start_time) / num_trials / CUDA_GRAPH_CAPTURE_CYCLES * 1000
            )

        except Exception as e:
            logger.error("CUDA graph benchmark failed: %s", e)
            raise RuntimeError(
                f"CUDA graph benchmark failed for communicator: {e}"
            ) from e


def _calculate_speedup_info(comm_results: dict[str, float]) -> str:
    """Calculate speedup information for a single tensor size."""
    if not comm_results:
        return "N/A"
||||||
|
# Find the fastest communicator
|
||||||
|
fastest_comm = min(comm_results.keys(), key=lambda k: comm_results[k])
|
||||||
|
fastest_time = comm_results[fastest_comm]
|
||||||
|
|
||||||
|
# Calculate speedup vs PyNccl if available
|
||||||
|
if "pynccl" in comm_results:
|
||||||
|
pynccl_time = comm_results["pynccl"]
|
||||||
|
speedup = pynccl_time / fastest_time
|
||||||
|
return f"{fastest_comm} ({speedup:.2f}x)"
|
||||||
|
else:
|
||||||
|
return f"{fastest_comm} (N/A)"
|
||||||
|
|
||||||
|
|
||||||
|
def print_results(
|
||||||
|
results: dict[str, dict[str, float]], sequence_lengths: list[int], world_size: int
|
||||||
|
):
|
||||||
|
"""Print benchmark results in a formatted table."""
|
||||||
|
|
||||||
|
print(f"\n{'=' * 130}")
|
||||||
|
print("Device Communicator Benchmark Results")
|
||||||
|
print(
|
||||||
|
f"World Size: {world_size}, Data Type: {BENCHMARK_DTYPE}, "
|
||||||
|
f"Hidden Size: {HIDDEN_SIZE}"
|
||||||
|
)
|
||||||
|
print(f"{'=' * 130}")
|
||||||
|
|
||||||
|
# Get all communicator names
|
||||||
|
all_comms = set()
|
||||||
|
for size_results in results.values():
|
||||||
|
all_comms.update(size_results.keys())
|
||||||
|
|
||||||
|
all_comms = sorted(list(all_comms))
|
||||||
|
|
||||||
|
# Print header
|
||||||
|
header = f"{'Tensor Shape':<20}{'Tensor Size':<15}"
|
||||||
|
for comm in all_comms:
|
||||||
|
header += f"{comm:<20}"
|
||||||
|
header += f"{'Best (Speedup vs PyNccl)':<30}"
|
||||||
|
print(header)
|
||||||
|
print("-" * len(header))
|
||||||
|
|
||||||
|
# Print results for each sequence length
|
||||||
|
for seq_len in sequence_lengths:
|
||||||
|
if seq_len in results:
|
||||||
|
# Calculate tensor size in elements and bytes
|
||||||
|
tensor_elements = seq_len * HIDDEN_SIZE
|
||||||
|
tensor_bytes = tensor_elements * BENCHMARK_DTYPE.itemsize
|
||||||
|
|
||||||
|
# Format tensor size (MB)
|
||||||
|
tensor_size_mb = tensor_bytes / (1024 * 1024)
|
||||||
|
tensor_size_str = f"{tensor_size_mb:.2f} MB"
|
||||||
|
|
||||||
|
# Format tensor shape
|
||||||
|
tensor_shape = f"({seq_len}, {HIDDEN_SIZE})"
|
||||||
|
|
||||||
|
row = f"{tensor_shape:<20}{tensor_size_str:<15}"
|
||||||
|
for comm in all_comms:
|
||||||
|
if comm in results[seq_len]:
|
||||||
|
row += f"{results[seq_len][comm]:<20.3f}"
|
||||||
|
else:
|
||||||
|
row += f"{'N/A':<20}"
|
||||||
|
|
||||||
|
# Calculate speedup information
|
||||||
|
speedup_info = _calculate_speedup_info(results[seq_len])
|
||||||
|
row += f"{speedup_info:<30}"
|
||||||
|
|
||||||
|
print(row)
|
||||||
|
|
||||||
|
print(f"{'=' * 130}")
|
||||||
|
print("All times are in milliseconds (ms) per allreduce operation")
|
||||||
|
print("Speedup column shows: fastest_algorithm (speedup_vs_pynccl)")
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
parser = FlexibleArgumentParser(description="Benchmark device communicators")
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"--sequence-lengths",
|
||||||
|
type=int,
|
||||||
|
nargs="+",
|
||||||
|
default=DEFAULT_SEQUENCE_LENGTHS,
|
||||||
|
help="Sequence lengths to benchmark (tensor shape: seq_len x hidden_size)",
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"--num-warmup", type=int, default=5, help="Number of warmup iterations"
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"--num-trials", type=int, default=50, help="Number of benchmark trials"
|
||||||
|
)
|
||||||
|
|
||||||
|
parser.add_argument("--output-json", type=str, help="Output results to JSON file")
|
||||||
|
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
# Initialize distributed
|
||||||
|
if not dist.is_initialized():
|
||||||
|
dist.init_process_group(backend="gloo")
|
||||||
|
rank = dist.get_rank()
|
||||||
|
world_size = dist.get_world_size()
|
||||||
|
|
||||||
|
# Set device
|
||||||
|
device = torch.device(f"cuda:{rank}")
|
||||||
|
torch.cuda.set_device(device)
|
||||||
|
|
||||||
|
# Get CPU process group
|
||||||
|
cpu_group = dist.new_group(backend="gloo")
|
||||||
|
|
||||||
|
# Disable USE_SYMM_MEM to avoid affecting the max_sizes
|
||||||
|
# in symm_mem and custom_all_reduce for benchmark
|
||||||
|
os.environ["VLLM_ALLREDUCE_USE_SYMM_MEM"] = "0"
|
||||||
|
|
||||||
|
# Initialize benchmark
|
||||||
|
benchmark = CommunicatorBenchmark(
|
||||||
|
rank, world_size, device, cpu_group, args.sequence_lengths
|
||||||
|
)
|
||||||
|
|
||||||
|
# Run benchmarks
|
||||||
|
all_results = {}
|
||||||
|
|
||||||
|
for seq_len in args.sequence_lengths:
|
||||||
|
if rank == 0:
|
||||||
|
logger.info(
|
||||||
|
"Benchmarking sequence length: %s (tensor shape: %s x %s)",
|
||||||
|
seq_len,
|
||||||
|
seq_len,
|
||||||
|
HIDDEN_SIZE,
|
||||||
|
)
|
||||||
|
|
||||||
|
results = benchmark.benchmark_allreduce(
|
||||||
|
sequence_length=seq_len,
|
||||||
|
num_warmup=args.num_warmup,
|
||||||
|
num_trials=args.num_trials,
|
||||||
|
)
|
||||||
|
|
||||||
|
all_results[seq_len] = results
|
||||||
|
|
||||||
|
# Synchronize between ranks
|
||||||
|
dist.barrier()
|
||||||
|
|
||||||
|
# Print results (only rank 0)
|
||||||
|
if rank == 0:
|
||||||
|
print_results(all_results, args.sequence_lengths, world_size)
|
||||||
|
|
||||||
|
# Save to JSON if requested
|
||||||
|
if args.output_json:
|
||||||
|
# Add speedup information to results
|
||||||
|
enhanced_results = {}
|
||||||
|
for seq_len, comm_results in all_results.items():
|
||||||
|
enhanced_results[seq_len] = {
|
||||||
|
"timings": comm_results,
|
||||||
|
"speedup_info": _calculate_speedup_info(comm_results),
|
||||||
|
}
|
||||||
|
|
||||||
|
output_data = {
|
||||||
|
"world_size": world_size,
|
||||||
|
"dtype": str(BENCHMARK_DTYPE),
|
||||||
|
"hidden_size": HIDDEN_SIZE,
|
||||||
|
"sequence_lengths": args.sequence_lengths,
|
||||||
|
"num_warmup": args.num_warmup,
|
||||||
|
"num_trials": args.num_trials,
|
||||||
|
"cuda_graph_capture_cycles": CUDA_GRAPH_CAPTURE_CYCLES,
|
||||||
|
"results": enhanced_results,
|
||||||
|
}
|
||||||
|
|
||||||
|
with open(args.output_json, "w") as f:
|
||||||
|
json.dump(output_data, f, indent=2)
|
||||||
|
|
||||||
|
logger.info("Results saved to %s", args.output_json)
|
||||||
|
|
||||||
|
# Cleanup
|
||||||
|
if cpu_group != dist.group.WORLD:
|
||||||
|
dist.destroy_process_group(cpu_group)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
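For reference, this is a minimal sketch of how the "Best (Speedup vs PyNccl)" column is derived from the per-communicator timings collected above. The timing numbers are invented purely for illustration and are not measured results:

    # Hypothetical per-communicator latencies (ms) for one tensor size.
    sample = {"pynccl": 0.42, "ca_2stage": 0.30, "symm_mem_two_shot": 0.36}
    print(_calculate_speedup_info(sample))  # -> "ca_2stage (1.40x)"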
@@ -7,6 +7,7 @@ from benchmark_shapes import WEIGHT_SHAPES_MOE
 
 from vllm import _custom_ops as ops
 from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
+from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config
 from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp8
 from vllm.model_executor.layers.fused_moe.fused_moe import (
     fused_experts,
@@ -80,6 +81,11 @@ def bench_run(
         a, score, topk, renormalize=False
     )
 
+    ab_strides1 = torch.full((num_experts,), k, device="cuda", dtype=torch.int64)
+    ab_strides2 = torch.full((num_experts,), n, device="cuda", dtype=torch.int64)
+    c_strides1 = torch.full((num_experts,), 2 * n, device="cuda", dtype=torch.int64)
+    c_strides2 = torch.full((num_experts,), k, device="cuda", dtype=torch.int64)
+
     def run_triton_moe(
         a: torch.Tensor,
         w1: torch.Tensor,
@@ -91,6 +97,11 @@ def bench_run(
         a_scale: torch.Tensor,
         num_repeats: int,
     ):
+        quant_config = fp8_w8a8_moe_quant_config(
+            w1_scale=w1_scale,
+            w2_scale=w2_scale,
+            a1_scale=a_scale,
+        )
         for _ in range(num_repeats):
             fused_experts(
                 a,
@@ -98,10 +109,7 @@ def bench_run(
                 w2,
                 topk_weights,
                 topk_ids,
-                use_fp8_w8a8=True,
-                w1_scale=w1_scale,
-                w2_scale=w2_scale,
-                a1_scale=a_scale,
+                quant_config=quant_config,
             )
 
     def run_cutlass_moe(
@@ -111,11 +119,21 @@ def bench_run(
         w2: torch.Tensor,
         w1_scale: torch.Tensor,
         w2_scale: torch.Tensor,
+        ab_strides1: torch.Tensor,
+        ab_strides2: torch.Tensor,
+        c_strides1: torch.Tensor,
+        c_strides2: torch.Tensor,
         topk_weights: torch.Tensor,
         topk_ids: torch.Tensor,
         per_act_token: bool,
         num_repeats: int,
    ):
+        quant_config = fp8_w8a8_moe_quant_config(
+            w1_scale=w1_scale,
+            w2_scale=w2_scale,
+            per_act_token_quant=per_act_token,
+        )
+
         for _ in range(num_repeats):
             cutlass_moe_fp8(
                 a,
@@ -123,10 +141,11 @@ def bench_run(
                 w2,
                 topk_weights,
                 topk_ids,
-                w1_scale,
-                w2_scale,
-                per_act_token,
-                a1_scale=None,
+                ab_strides1,
+                ab_strides2,
+                c_strides1,
+                c_strides2,
+                quant_config=quant_config,
             )
 
     def run_cutlass_from_graph(
@@ -136,9 +155,19 @@ def bench_run(
         w2_q: torch.Tensor,
         w1_scale: torch.Tensor,
         w2_scale: torch.Tensor,
+        ab_strides1: torch.Tensor,
+        ab_strides2: torch.Tensor,
+        c_strides1: torch.Tensor,
+        c_strides2: torch.Tensor,
         topk_weights: torch.Tensor,
         topk_ids: torch.Tensor,
     ):
+        quant_config = fp8_w8a8_moe_quant_config(
+            w1_scale=w1_scale,
+            w2_scale=w2_scale,
+            per_act_token_quant=per_act_token,
+        )
+
         with set_current_vllm_config(
             VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
         ):
@@ -148,10 +177,11 @@ def bench_run(
                 w2_q,
                 topk_weights,
                 topk_ids,
-                w1_scale,
-                w2_scale,
-                per_act_token,
-                a1_scale=None,
+                ab_strides1,
+                ab_strides2,
+                c_strides1,
+                c_strides2,
+                quant_config=quant_config,
             )
 
     def run_triton_from_graph(
@@ -164,6 +194,11 @@ def bench_run(
         w2_scale: torch.Tensor,
         a_scale: torch.Tensor,
     ):
+        quant_config = fp8_w8a8_moe_quant_config(
+            w1_scale=w1_scale,
+            w2_scale=w2_scale,
+            a1_scale=a_scale,
+        )
         with set_current_vllm_config(
             VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
         ):
@@ -173,10 +208,7 @@ def bench_run(
                 w2,
                 topk_weights,
                 topk_ids,
-                use_fp8_w8a8=True,
-                w1_scale=w1_scale,
-                w2_scale=w2_scale,
-                a1_scale=a_scale,
+                quant_config=quant_config,
             )
 
     def replay_graph(graph, num_repeats):
@@ -194,6 +226,10 @@ def bench_run(
             w2_q,
             w1_scale,
             w2_scale,
+            ab_strides1,
+            ab_strides2,
+            c_strides1,
+            c_strides2,
             topk_weights,
             topk_ids,
         )
@@ -231,6 +267,10 @@ def bench_run(
         "w1_scale": w1_scale,
         "w2_scale": w2_scale,
         "per_act_token": per_act_token,
+        "ab_strides1": ab_strides1,
+        "ab_strides2": ab_strides2,
+        "c_strides1": c_strides1,
+        "c_strides2": c_strides2,
         # cuda graph params
         "cutlass_graph": cutlass_graph,
         "triton_graph": triton_graph,
@@ -289,6 +329,10 @@ def bench_run(
         w2_q,
         w1_scale,
         w2_scale,
+        ab_strides1,
+        ab_strides2,
+        c_strides1,
+        c_strides2,
         topk_weights,
         topk_ids,
         per_act_token,
@@ -297,7 +341,7 @@ def bench_run(
 
     results.append(
         benchmark.Timer(
-            stmt="run_cutlass_moe(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, topk_weights, topk_ids, per_act_token, num_runs)",  # noqa: E501
+            stmt="run_cutlass_moe(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, ab_strides1, ab_strides2, c_strides1, c_strides2, topk_weights, topk_ids, per_act_token, num_runs)",  # noqa: E501
             globals=globals,
             label=label,
             sub_label=sub_label,
@@ -79,9 +79,9 @@ def make_rand_lora_weight_tensor(
 
 
 def make_rand_tensors(
-    a_shape: tuple[int],
-    b_shape: tuple[int],
-    c_shape: tuple[int],
+    a_shape: tuple[int, ...],
+    b_shape: tuple[int, ...],
+    c_shape: tuple[int, ...],
     a_dtype: torch.dtype,
     b_dtype: torch.dtype,
     c_dtype: torch.dtype,
@@ -243,7 +243,7 @@ class OpType(Enum):
         lora_rank: int,
         num_loras: int,
         num_slices: int,
-    ) -> tuple[tuple[int], tuple[int], tuple[int]]:
+    ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
         """
         Given num_slices, return the shapes of the A, B, and C matrices
         in A x B = C, for the op_type
@@ -464,7 +464,11 @@ class BenchmarkTensors:
         for field_name in LoRAKernelMeta.__dataclass_fields__:
             field = getattr(self.lora_kernel_meta, field_name)
             assert isinstance(field, torch.Tensor)
-            setattr(self.lora_kernel_meta, field_name, to_device(field))
+            setattr(
+                self.lora_kernel_meta,
+                field_name,
+                to_device(field) if field_name != "no_lora_flag_cpu" else field,
+            )
 
     def metadata(self) -> tuple[int, int, int]:
         """
@@ -512,6 +516,7 @@ class BenchmarkTensors:
             "lora_token_start_loc": self.lora_kernel_meta.lora_token_start_loc,
             "lora_ids": self.lora_kernel_meta.active_lora_ids,
             "scaling": 1.0,
+            "no_lora_flag_cpu": self.lora_kernel_meta.no_lora_flag_cpu,
         }
 
     def as_lora_expand_kwargs(self, add_inputs: bool) -> dict[str, Any]:
@@ -552,6 +557,7 @@ class BenchmarkTensors:
             "lora_ids": self.lora_kernel_meta.active_lora_ids,
             "offset_start": 0,
             "add_inputs": add_inputs,
+            "no_lora_flag_cpu": self.lora_kernel_meta.no_lora_flag_cpu,
         }
 
     def bench_fn_kwargs(
@@ -637,7 +643,7 @@ def bench_optype(
     # Clear LoRA optimization hash-maps.
     _LORA_A_PTR_DICT.clear()
     _LORA_B_PTR_DICT.clear()
-    # Run bench function so that _LORA_A_PTR_DICT and _LORA_B_PTR_DICT are setup
+    # Run bench function so that _LORA_A_PTR_DICT and _LORA_B_PTR_DICT are set up
     for kwargs in kwargs_list:
         op_type.bench_fn()(**kwargs)
     torch.cuda.synchronize()
@@ -253,28 +253,7 @@ def marlin_create_bench_fn(bt: BenchmarkTensors) -> Callable:
     else:
         assert bt.a.dtype == torch.int8
         assert bt.wtype == scalar_types.uint4b8
-        if bt.w_ch_s is not None:
-            s_ch = bt.w_ch_s.to(torch.float32)
-        else:
-            s_ch = torch.ones(bt.w_ref.shape[1], dtype=torch.float32, device=device)
-
-        if bt.w_tok_s is not None:
-            s_tok = bt.w_tok_s.to(torch.float32)
-        else:
-            s_tok = torch.ones(bt.a.shape[0], dtype=torch.float32, device=device)
-
-        fn = lambda: ops.marlin_qqq_gemm(
-            a=bt.a,
-            b_q_weight=w_q,
-            s_group=w_s,
-            s_tok=s_tok,
-            s_ch=s_ch,
-            workspace=workspace.scratch,
-            size_m=bt.a.shape[0],
-            size_n=bt.w_ref.shape[1],
-            size_k=bt.w_ref.shape[0],
-        )
+        raise NotImplementedError("QQQ is not supported anymore")
 
     return fn
@@ -305,6 +284,25 @@ def machete_create_bench_fn(
     )
 
 
+def cutlass_w4a8_create_bench_fn(
+    bt: BenchmarkTensors, out_type=torch.dtype, schedule=None
+) -> Callable:
+    w_q = bt.w_q.t().contiguous().t()  # make col major
+    w_q = ops.cutlass_encode_and_reorder_int4b(w_q)
+    # expects fp8 scales
+    w_s = ops.cutlass_pack_scale_fp8(bt.w_g_s.to(torch.float8_e4m3fn))
+
+    return lambda: ops.cutlass_w4a8_mm(
+        a=bt.a,
+        b_q=w_q,
+        b_group_scales=w_s,
+        b_group_size=bt.group_size,
+        b_channel_scales=bt.w_ch_s,
+        a_token_scales=bt.w_tok_s,
+        maybe_schedule=schedule,
+    )
+
+
 # impl
 
 # bench
@@ -406,6 +404,20 @@ def bench(
             )
         )
 
+    # cutlass w4a8
+    if types.act_type == torch.float8_e4m3fn and group_size == 128:
+        timers.append(
+            bench_fns(
+                label,
+                sub_label,
+                f"cutlass w4a8 ({name_type_string})",
+                [
+                    cutlass_w4a8_create_bench_fn(bt, out_type=types.output_type)
+                    for bt in benchmark_tensors
+                ],
+            )
+        )
+
     if sweep_schedules:
         global _SWEEP_SCHEDULES_RESULTS
 
@@ -14,6 +14,10 @@ import ray
 import torch
 from ray.experimental.tqdm_ray import tqdm
 
+from vllm.model_executor.layers.fused_moe.config import (
+    FusedMoEQuantConfig,
+    _get_config_dtype_str,
+)
 from vllm.model_executor.layers.fused_moe.fused_moe import *
 from vllm.platforms import current_platform
 from vllm.transformers_utils.config import get_config
@@ -134,43 +138,36 @@ def benchmark_config(
     def run():
         from vllm.model_executor.layers.fused_moe import override_config
 
+        if use_fp8_w8a8:
+            quant_dtype = torch.float8_e4m3fn
+        elif use_int8_w8a16:
+            quant_dtype = torch.int8
+        else:
+            quant_dtype = None
+
+        quant_config = FusedMoEQuantConfig.make(
+            quant_dtype=quant_dtype,
+            w1_scale=w1_scale,
+            w2_scale=w2_scale,
+            a1_scale=a1_scale,
+            a2_scale=a2_scale,
+            block_shape=block_quant_shape,
+        )
+
         with override_config(config):
-            if use_deep_gemm:
-                topk_weights, topk_ids, token_expert_indices = fused_topk(
-                    x, input_gating, topk, False
-                )
-                return fused_experts(
-                    x,
-                    w1,
-                    w2,
-                    topk_weights,
-                    topk_ids,
-                    inplace=True,
-                    use_fp8_w8a8=use_fp8_w8a8,
-                    w1_scale=w1_scale,
-                    w2_scale=w2_scale,
-                    a1_scale=a1_scale,
-                    a2_scale=a2_scale,
-                    block_shape=block_quant_shape,
-                    allow_deep_gemm=True,
-                )
-            else:
-                fused_moe(
-                    x,
-                    w1,
-                    w2,
-                    input_gating,
-                    topk,
-                    renormalize=True,
-                    inplace=True,
-                    use_fp8_w8a8=use_fp8_w8a8,
-                    use_int8_w8a16=use_int8_w8a16,
-                    w1_scale=w1_scale,
-                    w2_scale=w2_scale,
-                    a1_scale=a1_scale,
-                    a2_scale=a2_scale,
-                    block_shape=block_quant_shape,
-                )
+            topk_weights, topk_ids, token_expert_indices = fused_topk(
+                x, input_gating, topk, renormalize=not use_deep_gemm
+            )
+            return fused_experts(
+                x,
+                w1,
+                w2,
+                topk_weights,
+                topk_ids,
+                inplace=True,
+                quant_config=quant_config,
+                allow_deep_gemm=use_deep_gemm,
+            )
 
     # JIT compilation & warmup
     run()
@@ -414,13 +411,15 @@ class BenchmarkWorker:
         use_deep_gemm: bool = False,
     ) -> tuple[dict[str, int], float]:
         current_platform.seed_everything(self.seed)
-        dtype_str = get_config_dtype_str(
+        dtype_str = _get_config_dtype_str(
             dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8
         )
         # NOTE(woosuk): The current naming convention uses w2.shape[2], which
         # is the intermediate size after silu_and_mul.
+        block_n = block_quant_shape[0] if block_quant_shape else None
+        block_k = block_quant_shape[1] if block_quant_shape else None
         op_config = get_moe_configs(
-            num_experts, shard_intermediate_size // 2, dtype_str
+            num_experts, shard_intermediate_size // 2, dtype_str, block_n, block_k
        )
         if op_config is None:
             config = get_default_config(
@@ -430,7 +429,7 @@ class BenchmarkWorker:
                 hidden_size,
                 topk,
                 dtype_str,
-                is_marlin=False,
+                block_quant_shape,
             )
         else:
             config = op_config[min(op_config.keys(), key=lambda x: abs(x - num_tokens))]
@@ -545,7 +544,7 @@ def save_configs(
     block_quant_shape: list[int],
     save_dir: str,
 ) -> None:
-    dtype_str = get_config_dtype_str(
+    dtype_str = _get_config_dtype_str(
         dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8
     )
 
@@ -558,7 +557,7 @@ def save_configs(
     filename = os.path.join(save_dir, filename)
     print(f"Writing best config to {filename}...")
     with open(filename, "w") as f:
-        json.dump(configs, f, indent=4)
+        json.dump({"triton_version": triton.__version__, **configs}, f, indent=4)
         f.write("\n")
 
 
@@ -585,14 +584,19 @@ def main(args: argparse.Namespace):
         topk = config.num_experts_per_tok
         intermediate_size = config.intermediate_size
     elif config.architectures[0] in (
-        "DeepseekV3ForCausalLM",
         "DeepseekV2ForCausalLM",
+        "DeepseekV3ForCausalLM",
+        "DeepseekV32ForCausalLM",
         "Glm4MoeForCausalLM",
     ):
         E = config.n_routed_experts
         topk = config.num_experts_per_tok
         intermediate_size = config.moe_intermediate_size
-    elif config.architectures[0] in ("Qwen2MoeForCausalLM", "Qwen3MoeForCausalLM"):
+    elif config.architectures[0] in (
+        "Qwen2MoeForCausalLM",
+        "Qwen3MoeForCausalLM",
+        "Qwen3NextForCausalLM",
+    ):
         E = config.num_experts
         topk = config.num_experts_per_tok
         intermediate_size = config.moe_intermediate_size
@@ -676,7 +680,11 @@ def main(args: argparse.Namespace):
         is_fp16 = not (use_fp8_w8a8 or use_int8_w8a16)
         search_space = get_configs_compute_bound(is_fp16, block_quant_shape)
         print(f"Start tuning over {len(search_space)} configurations...")
+        if use_deep_gemm:
+            raise ValueError(
+                "Tuning with --use-deep-gemm is not supported as it only tunes Triton "
+                "kernels. Please remove the flag."
+            )
         start = time.time()
         configs = _distribute(
             "tune",
benchmarks/kernels/benchmark_polynorm.py (new file)
@@ -0,0 +1,155 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import itertools

import torch

from vllm import _custom_ops as vllm_ops
from vllm.triton_utils import triton


def polynorm_naive(
    x: torch.Tensor,
    weight: torch.Tensor,
    bias: torch.Tensor,
    eps: float = 1e-6,
):
    orig_shape = x.shape
    x = x.view(-1, x.shape[-1])

    def norm(x, eps: float):
        return x / torch.sqrt(x.pow(2).mean(-1, keepdim=True) + eps)

    x = x.float()
    return (
        (
            weight[0] * norm(x**3, eps)
            + weight[1] * norm(x**2, eps)
            + weight[2] * norm(x, eps)
            + bias
        )
        .to(weight.dtype)
        .view(orig_shape)
    )


def polynorm_vllm(
    x: torch.Tensor,
    weight: torch.Tensor,
    bias: torch.Tensor,
    eps: float = 1e-6,
):
    orig_shape = x.shape
    x = x.view(-1, x.shape[-1])

    out = torch.empty_like(x)
    vllm_ops.poly_norm(out, x, weight, bias, eps)
    output = out

    output = output.view(orig_shape)
    return output


def calculate_diff(batch_size, seq_len, hidden_dim):
    dtype = torch.bfloat16
    x = torch.randn(batch_size, seq_len, hidden_dim, dtype=dtype, device="cuda")
    weight = torch.ones(3, dtype=dtype, device="cuda")
    bias = torch.ones(1, dtype=dtype, device="cuda")

    output_naive = polynorm_naive(x, weight, bias)
    output_vllm = polynorm_vllm(x, weight, bias)

    if torch.allclose(output_naive, output_vllm, atol=1e-2, rtol=1e-2):
        print("✅ All implementations match")
    else:
        print("❌ Implementations differ")


batch_size_range = [2**i for i in range(0, 7, 2)]
seq_length_range = [2**i for i in range(6, 11, 1)]
dim_range = [2048, 4096]
configs = list(itertools.product(dim_range, batch_size_range, seq_length_range))


def get_benchmark():
    @triton.testing.perf_report(
        triton.testing.Benchmark(
            x_names=["dim", "batch_size", "seq_len"],
            x_vals=[list(_) for _ in configs],
            line_arg="provider",
            line_vals=["naive", "vllm"],
            line_names=["Naive", "vLLM"],
            styles=[("blue", "-"), ("red", "-")],
            ylabel="us",
            plot_name="polynorm-perf",
            args={},
        )
    )
    def benchmark(dim, batch_size, seq_len, provider):
        dtype = torch.bfloat16
        hidden_dim = dim * 4

        x = torch.randn(batch_size, seq_len, hidden_dim, dtype=dtype, device="cuda")
        weight = torch.ones(3, dtype=dtype, device="cuda")
        bias = torch.ones(1, dtype=dtype, device="cuda")

        quantiles = [0.5, 0.2, 0.8]

        if provider == "naive":
            ms, min_ms, max_ms = triton.testing.do_bench(
                lambda: polynorm_naive(x, weight, bias),
                quantiles=quantiles,
            )
        else:
            ms, min_ms, max_ms = triton.testing.do_bench(
                lambda: polynorm_vllm(x, weight, bias),
                quantiles=quantiles,
            )

        return 1000 * ms, 1000 * max_ms, 1000 * min_ms

    return benchmark


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--batch-size",
        type=int,
        default=4,
        help="Batch size",
    )
    parser.add_argument(
        "--seq-len",
        type=int,
        default=128,
        help="Sequence length",
    )
    parser.add_argument(
        "--hidden-dim",
        type=int,
        default=8192,
        help="Intermediate size of MLP",
    )
    parser.add_argument(
        "--save-path",
        type=str,
        default="./configs/polnorm/",
        help="Path to save polnorm benchmark results",
    )

    args = parser.parse_args()

    # Run correctness test
    calculate_diff(
        batch_size=args.batch_size,
        seq_len=args.seq_len,
        hidden_dim=args.hidden_dim,
    )

    benchmark = get_benchmark()
    # Run performance benchmark
    benchmark.run(print_data=True, save_path=args.save_path)
benchmarks/kernels/benchmark_reshape_and_cache.py (new file)
@@ -0,0 +1,174 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from __future__ import annotations

import random
import time

import torch
from tabulate import tabulate

from vllm import _custom_ops as ops
from vllm.logger import init_logger
from vllm.platforms import current_platform
from vllm.utils import (
    STR_DTYPE_TO_TORCH_DTYPE,
    FlexibleArgumentParser,
    create_kv_caches_with_random,
)

logger = init_logger(__name__)


@torch.inference_mode()
def run_benchmark(
    num_tokens: int,
    num_heads: int,
    head_size: int,
    block_size: int,
    num_blocks: int,
    dtype: torch.dtype,
    kv_cache_dtype: str,
    num_iters: int,
    benchmark_mode: str,
    device: str = "cuda",
) -> float:
    """Return latency (seconds) for given num_tokens."""

    if kv_cache_dtype == "fp8" and head_size % 16:
        raise ValueError("fp8 kv-cache requires head_size to be a multiple of 16.")

    current_platform.seed_everything(42)
    torch.set_default_device(device)

    # create random key / value tensors [T, H, D].
    key = torch.randn(num_tokens, num_heads, head_size, dtype=dtype, device=device)
    value = torch.randn_like(key)

    # prepare the slot mapping.
    # each token is assigned a unique slot in the KV-cache.
    num_slots = block_size * num_blocks
    if num_tokens > num_slots:
        raise ValueError("num_tokens cannot exceed the total number of cache slots")
    slot_mapping_lst = random.sample(range(num_slots), num_tokens)
    slot_mapping = torch.tensor(slot_mapping_lst, dtype=torch.long, device=device)

    key_caches, value_caches = create_kv_caches_with_random(
        num_blocks,
        block_size,
        1,  # num_layers
        num_heads,
        head_size,
        kv_cache_dtype,
        dtype,
        device=device,
    )
    key_cache, value_cache = key_caches[0], value_caches[0]
    # to free unused memory
    del key_caches, value_caches

    # compute per-kernel scaling factors for fp8 conversion (if used).
    k_scale = (key.amax() / 64.0).to(torch.float32)
    v_scale = (value.amax() / 64.0).to(torch.float32)

    function_under_test = lambda: ops.reshape_and_cache(
        key,  # noqa: F821
        value,  # noqa: F821
        key_cache,  # noqa: F821
        value_cache,  # noqa: F821
        slot_mapping,  # noqa: F821
        kv_cache_dtype,
        k_scale,
        v_scale,
    )

    if benchmark_mode == "cudagraph":
        g = torch.cuda.CUDAGraph()
        with torch.cuda.graph(g):
            function_under_test()
        torch.cuda.synchronize()
        function_under_test = lambda: g.replay()

    def run_cuda_benchmark(n_iters: int) -> float:
        nonlocal key, value, key_cache, value_cache, slot_mapping
        torch.cuda.synchronize()
        start = time.perf_counter()
        for _ in range(n_iters):
            function_under_test()
            torch.cuda.synchronize()
        end = time.perf_counter()
        return (end - start) / n_iters

    # warm-up
    run_cuda_benchmark(3)

    lat = run_cuda_benchmark(num_iters)

    # free tensors to mitigate OOM when sweeping
    del key, value, key_cache, value_cache, slot_mapping
    torch.cuda.empty_cache()

    return lat


def main(args):
    rows = []
    for exp in range(1, 17):
        n_tok = 2**exp
        lat = run_benchmark(
            num_tokens=n_tok,
            num_heads=args.num_heads,
            head_size=args.head_size,
            block_size=args.block_size,
            num_blocks=args.num_blocks,
            dtype=STR_DTYPE_TO_TORCH_DTYPE[args.dtype],
            kv_cache_dtype=args.kv_cache_dtype,
            num_iters=args.iters,
            benchmark_mode=args.mode,
            device="cuda",
        )
        rows.append([n_tok, lat * 1e6])  # convert to microseconds

    print(f"Benchmark results for implementation cuda (measuring with {args.mode}):")
    print(tabulate(rows, headers=["num_tokens", "latency (µs)"], floatfmt=".3f"))


if __name__ == "__main__":
    parser = FlexibleArgumentParser()

    parser.add_argument("--num-heads", type=int, default=128)
    parser.add_argument(
        "--head-size",
        type=int,
        choices=[64, 80, 96, 112, 120, 128, 192, 256],
        default=128,
    )
    parser.add_argument("--block-size", type=int, choices=[16, 32], default=16)
    parser.add_argument("--num-blocks", type=int, default=128 * 128)

    parser.add_argument(
        "--dtype",
        type=str,
        choices=["half", "bfloat16", "float"],
        default="bfloat16",
    )

    parser.add_argument(
        "--kv-cache-dtype",
        type=str,
        choices=["auto", "fp8"],
        default="auto",
    )

    parser.add_argument("--iters", type=int, default=200)

    parser.add_argument(
        "--mode",
        type=str,
        choices=["cudagraph", "no_graph"],
        default="cudagraph",
    )

    args = parser.parse_args()

    main(args)
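The "cudagraph" mode above captures one call to the cache-write kernel into a CUDA graph and then times graph replays, which removes Python and kernel-launch overhead from the measurement. A minimal, self-contained sketch of that capture-and-replay pattern, using a toy in-place op instead of the vLLM kernel (the helper name time_graph_replay is illustrative only, not part of the benchmark):

    import time
    import torch

    def time_graph_replay(fn, n_iters: int = 100) -> float:
        # Warm up `fn` on a side stream before capture, as recommended for CUDA graphs.
        s = torch.cuda.Stream()
        s.wait_stream(torch.cuda.current_stream())
        with torch.cuda.stream(s):
            fn()
        torch.cuda.current_stream().wait_stream(s)
        # Capture a single call into a graph, then time replays of the graph.
        g = torch.cuda.CUDAGraph()
        with torch.cuda.graph(g):
            fn()
        torch.cuda.synchronize()
        start = time.perf_counter()
        for _ in range(n_iters):
            g.replay()
        torch.cuda.synchronize()
        return (time.perf_counter() - start) / n_iters

    x = torch.randn(1 << 20, device="cuda")
    print(time_graph_replay(lambda: torch.add(x, 1.0, out=x)))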
@@ -9,6 +9,9 @@ import torch
 from tabulate import tabulate
 
 from vllm import _custom_ops as ops
+from vllm.attention.ops.triton_reshape_and_cache_flash import (
+    triton_reshape_and_cache_flash,
+)
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
 from vllm.utils import (
@@ -31,6 +34,8 @@ def run_benchmark(
     kv_cache_dtype: str,
     kv_cache_layout: str,
     num_iters: int,
+    implementation: str,
+    benchmark_mode: str,
     device: str = "cuda",
 ) -> float:
     """Return latency (seconds) for given num_tokens."""
@@ -38,6 +43,14 @@ def run_benchmark(
     if kv_cache_dtype == "fp8" and head_size % 16:
         raise ValueError("fp8 kv-cache requires head_size to be a multiple of 16.")
 
+    if implementation not in ("cuda", "triton"):
+        raise ValueError(
+            f"Unsupported implementation: {implementation}. "
+            "Only 'cuda' and 'triton' are supported."
+        )
+    if implementation == "triton" and kv_cache_layout == "HND":
+        return float("nan")  # Triton does not support HND layout yet.
+
     current_platform.seed_everything(42)
     torch.set_default_device(device)
 
@@ -65,27 +78,49 @@ def run_benchmark(
         cache_layout=kv_cache_layout,
     )
     key_cache, value_cache = key_caches[0], value_caches[0]
+    # to free unused memory
+    del key_caches, value_caches
 
     # compute per-kernel scaling factors for fp8 conversion (if used).
     k_scale = (key.amax() / 64.0).to(torch.float32)
     v_scale = (value.amax() / 64.0).to(torch.float32)
 
+    if implementation == "cuda":
+        function_under_test = lambda: ops.reshape_and_cache_flash(
+            key,  # noqa: F821
+            value,  # noqa: F821
+            key_cache,  # noqa: F821
+            value_cache,  # noqa: F821
+            slot_mapping,  # noqa: F821
+            kv_cache_dtype,
+            k_scale,
+            v_scale,
+        )
+    else:
+        function_under_test = lambda: triton_reshape_and_cache_flash(
+            key,  # noqa: F821
+            value,  # noqa: F821
+            key_cache,  # noqa: F821
+            value_cache,  # noqa: F821
+            slot_mapping,  # noqa: F821
+            kv_cache_dtype,
+            k_scale,
+            v_scale,
+        )
+    if benchmark_mode == "cudagraph":
+        g = torch.cuda.CUDAGraph()
+        with torch.cuda.graph(g):
+            function_under_test()
+        torch.cuda.synchronize()
+        function_under_test = lambda: g.replay()
+
     def run_cuda_benchmark(n_iters: int) -> float:
         nonlocal key, value, key_cache, value_cache, slot_mapping
         torch.cuda.synchronize()
         start = time.perf_counter()
         for _ in range(n_iters):
-            ops.reshape_and_cache_flash(
-                key,
-                value,
-                key_cache,
-                value_cache,
-                slot_mapping,
-                kv_cache_dtype,
-                k_scale,
-                v_scale,
-            )
-        torch.cuda.synchronize()
+            function_under_test()
+            torch.cuda.synchronize()
         end = time.perf_counter()
         return (end - start) / n_iters
 
@@ -116,10 +151,16 @@ def main(args):
                 kv_cache_dtype=args.kv_cache_dtype,
                 kv_cache_layout=layout,
                 num_iters=args.iters,
+                implementation=args.implementation,
+                benchmark_mode=args.mode,
                 device="cuda",
             )
             rows.append([n_tok, layout, f"{lat * 1e6:.3f}"])
 
+    print(
+        f"Benchmark results for implementation {args.implementation}"
+        f" (measuring with {args.mode}):"
+    )
     print(tabulate(rows, headers=["num_tokens", "layout", "latency (µs)"]))
 
 
@@ -151,6 +192,21 @@ if __name__ == "__main__":
     )
 
     parser.add_argument("--iters", type=int, default=100)
+
+    parser.add_argument(
+        "--implementation",
+        type=str,
+        choices=["cuda", "triton"],
+        default="cuda",
+    )
+
+    parser.add_argument(
+        "--mode",
+        type=str,
+        choices=["cudagraph", "no_graph"],
+        default="cudagraph",
+    )
+
     args = parser.parse_args()
 
     main(args)
benchmarks/kernels/benchmark_silu_mul_fp8_quant.py (new file)
@@ -0,0 +1,675 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Callable

import matplotlib.pyplot as plt
import numpy as np
import torch

from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
    silu_mul_fp8_quant_deep_gemm_cuda,
)
from vllm.platforms import current_platform
from vllm.triton_utils import tl, triton
from vllm.utils.deep_gemm import is_deep_gemm_e8m0_used


@triton.jit
def _silu_mul_fp8_quant_deep_gemm(
    # Pointers ------------------------------------------------------------
    input_ptr,  # 16-bit activations (E, T, 2*H)
    y_q_ptr,  # fp8 quantized activations (E, T, H)
    y_s_ptr,  # 16-bit scales (E, T, G)
    counts_ptr,  # int32 num tokens per expert (E)
    # Sizes ---------------------------------------------------------------
    H: tl.constexpr,  # hidden dimension (per output)
    GROUP_SIZE: tl.constexpr,  # elements per group (usually 128)
    # Strides for input (elements) ---------------------------------------
    stride_i_e,
    stride_i_t,
    stride_i_h,
    # Strides for y_q (elements) -----------------------------------------
    stride_yq_e,
    stride_yq_t,
    stride_yq_h,
    # Strides for y_s (elements) -----------------------------------------
    stride_ys_e,
    stride_ys_t,
    stride_ys_g,
    # Stride for counts (elements)
    stride_counts_e,
    # Numeric params ------------------------------------------------------
    eps: tl.constexpr,
    fp8_min: tl.constexpr,
    fp8_max: tl.constexpr,
    use_ue8m0: tl.constexpr,
    # Meta ---------------------------------------------------------------
    BLOCK: tl.constexpr,
    NUM_STAGES: tl.constexpr,
):
    G = H // GROUP_SIZE

    # map program id -> (e, g)
    pid = tl.program_id(0)
    e = pid // G
    g = pid % G

    e = e.to(tl.int64)
    g = g.to(tl.int64)

    # number of valid tokens for this expert
    n_tokens = tl.load(counts_ptr + e * stride_counts_e).to(tl.int64)

    cols = tl.arange(0, BLOCK).to(tl.int64)
    mask = cols < BLOCK

    base_input_offset = e * stride_i_e + g * GROUP_SIZE * stride_i_h
    base_gate_offset = base_input_offset + cols * stride_i_h
    base_up_offset = base_input_offset + H * stride_i_h + cols * stride_i_h
    base_yq_offset = e * stride_yq_e + g * GROUP_SIZE * stride_yq_h + cols * stride_yq_h
    base_ys_offset = e * stride_ys_e + g * stride_ys_g

    for t in tl.range(0, n_tokens, num_stages=NUM_STAGES):
        gate = tl.load(
            input_ptr + base_gate_offset + t * stride_i_t, mask=mask, other=0.0
        ).to(tl.float32)
        up = tl.load(input_ptr + base_up_offset + t * stride_i_t, mask=mask, other=0.0)

        gate = gate * (1.0 / (1.0 + tl.exp(-gate)))
        y = gate * up

        y_s = tl.maximum(tl.max(tl.abs(y)), eps) / fp8_max
        if use_ue8m0:
            y_s = tl.exp2(tl.ceil(tl.log2(y_s)))

        y_q = tl.clamp(y / y_s, fp8_min, fp8_max).to(y_q_ptr.dtype.element_ty)

        tl.store(y_q_ptr + base_yq_offset + t * stride_yq_t, y_q, mask=mask)
        tl.store(y_s_ptr + base_ys_offset + t * stride_ys_t, y_s)


def silu_mul_fp8_quant_deep_gemm_triton(
    y: torch.Tensor,  # (E, T, 2*H)
    tokens_per_expert: torch.Tensor,  # (E,) number of valid tokens per expert
    num_parallel_tokens,
    group_size: int = 128,
    eps: float = 1e-10,
) -> tuple[torch.Tensor, torch.Tensor]:
    """Quantize silu(y[..., :H]) * y[..., H:] to FP8 with group per-token scales

    y has shape (E, T, 2*H). The first half of the last dimension is
    silu-activated, multiplied by the second half, then quantized into FP8.

    Returns `(y_q, y_s)` where
    * `y_q`: FP8 tensor, shape (E, T, H), same layout as y[..., :H]
    * `y_s`: FP32 tensor, shape (E, T, H // group_size), strides (T*G, 1, T)
    """
    assert y.ndim == 3, "y must be (E, T, 2*H)"
    E, T, H2 = y.shape
    assert H2 % 2 == 0, "last dim of y must be even (2*H)"
    H = H2 // 2
    G = (H + group_size - 1) // group_size
    assert H % group_size == 0, "H must be divisible by group_size"
    assert tokens_per_expert.ndim == 1 and tokens_per_expert.shape[0] == E, (
        "tokens_per_expert must be shape (E,)"
    )
    tokens_per_expert = tokens_per_expert.to(device=y.device, dtype=torch.int32)

    # allocate outputs
    fp8_dtype = torch.float8_e4m3fn
    y_q = torch.empty((E, T, H), dtype=fp8_dtype, device=y.device)

    # strides (elements)
    stride_i_e, stride_i_t, stride_i_h = y.stride()
    stride_yq_e, stride_yq_t, stride_yq_h = y_q.stride()

    # desired scale strides (elements): (T*G, 1, T)
    stride_ys_e = T * G
    stride_ys_t = 1
    stride_ys_g = T
    y_s = torch.empty_strided(
        (E, T, G),
        (stride_ys_e, stride_ys_t, stride_ys_g),
        dtype=torch.float32,
        device=y.device,
    )

    stride_cnt_e = tokens_per_expert.stride()[0]

    # Static grid over experts and H-groups.
    # A loop inside the kernel handles the token dim
    grid = (E * G,)

    f_info = torch.finfo(fp8_dtype)
    fp8_max = f_info.max
    fp8_min = f_info.min

    _silu_mul_fp8_quant_deep_gemm[grid](
        y,
        y_q,
        y_s,
        tokens_per_expert,
        H,
        group_size,
        stride_i_e,
        stride_i_t,
        stride_i_h,
        stride_yq_e,
        stride_yq_t,
        stride_yq_h,
        stride_ys_e,
        stride_ys_t,
        stride_ys_g,
        stride_cnt_e,
        eps,
        fp8_min,
        fp8_max,
        is_deep_gemm_e8m0_used(),
        BLOCK=group_size,
        NUM_STAGES=4,
        num_warps=1,
    )

    return y_q, y_s
|
||||||
|
strategies = ["uniform", "max_t", "first_t"]
|
||||||
|
|
||||||
|
|
||||||
|
def benchmark(
|
||||||
|
kernel: Callable,
|
||||||
|
E: int,
|
||||||
|
T: int,
|
||||||
|
H: int,
|
||||||
|
total_tokens: int,
|
||||||
|
num_parallel_tokens: int = 64,
|
||||||
|
G: int = 128,
|
||||||
|
runs: int = 200,
|
||||||
|
num_warmups: int = 20,
|
||||||
|
gen_strategy: str = "default",
|
||||||
|
iterations_per_run: int = 20,
|
||||||
|
):
|
||||||
|
def generate_data(seed_offset=0):
|
||||||
|
"""Generate input data with given seed offset"""
|
||||||
|
current_platform.seed_everything(42 + seed_offset)
|
||||||
|
y = torch.rand((E, T, 2 * H), dtype=torch.bfloat16, device="cuda").contiguous()
|
||||||
|
|
||||||
|
if gen_strategy == "uniform":
|
||||||
|
r = torch.rand(size=(E,), device="cuda")
|
||||||
|
r /= r.sum()
|
||||||
|
r *= total_tokens
|
||||||
|
tokens_per_expert = r.int()
|
||||||
|
tokens_per_expert = torch.minimum(
|
||||||
|
tokens_per_expert,
|
||||||
|
torch.ones((E,), device=r.device, dtype=torch.int) * T,
|
||||||
|
)
|
||||||
|
elif gen_strategy == "max_t":
|
||||||
|
tokens_per_expert = torch.empty(size=(E,), dtype=torch.int32, device="cuda")
|
||||||
|
tokens_per_expert.fill_(total_tokens / E)
|
||||||
|
elif gen_strategy == "first_t":
|
||||||
|
tokens_per_expert = torch.zeros(size=(E,), dtype=torch.int32, device="cuda")
|
||||||
|
tokens_per_expert[0] = min(T, total_tokens)
|
||||||
|
else:
|
||||||
|
raise ValueError(f"Unknown generation strategy: {gen_strategy}")
|
||||||
|
return y, tokens_per_expert
|
||||||
|
|
||||||
|
dataset_count = 4
|
||||||
|
# Pre-generate different input matrices for each iteration to avoid cache effects
|
||||||
|
data_sets = [generate_data(i) for i in range(dataset_count)]
|
||||||
|
|
||||||
|
# Warmup
|
||||||
|
y, tokens_per_expert = data_sets[0]
|
||||||
|
for _ in range(num_warmups):
|
||||||
|
kernel(
|
||||||
|
y, tokens_per_expert, num_parallel_tokens=num_parallel_tokens, group_size=G
|
||||||
|
)
|
||||||
|
torch.cuda.synchronize()
|
||||||
|
|
||||||
|
start_event = torch.cuda.Event(enable_timing=True)
|
||||||
|
end_event = torch.cuda.Event(enable_timing=True)
|
||||||
|
|
||||||
|
# Benchmark
|
||||||
|
latencies: list[float] = []
|
||||||
|
for _ in range(runs):
|
||||||
|
torch.cuda.synchronize()
|
||||||
|
|
||||||
|
start_event.record()
|
||||||
|
for i in range(iterations_per_run):
|
||||||
|
y, tokens_per_expert = data_sets[i % dataset_count]
|
||||||
|
kernel(
|
||||||
|
y,
|
||||||
|
tokens_per_expert,
|
||||||
|
num_parallel_tokens=num_parallel_tokens,
|
||||||
|
group_size=G,
|
||||||
|
)
|
||||||
|
end_event.record()
|
||||||
|
end_event.synchronize()
|
||||||
|
|
||||||
|
total_time_ms = start_event.elapsed_time(end_event)
|
||||||
|
per_iter_time_ms = total_time_ms / iterations_per_run
|
||||||
|
latencies.append(per_iter_time_ms)
|
||||||
|
|
||||||
|
# Use median instead of average for better outlier handling
|
||||||
|
median_time_ms = np.median(latencies)
|
||||||
|
median_time_s = median_time_ms / 1000
|
||||||
|
|
||||||
|
# Calculate actual work done (using first dataset for consistency)
|
||||||
|
_, tokens_per_expert = data_sets[0]
|
||||||
|
actual_tokens = tokens_per_expert.sum().item()
|
||||||
|
actual_elements = actual_tokens * H
|
||||||
|
|
||||||
|
# GFLOPS: operations per element = exp + 3 muls + 1 div + quantization ops ≈ 8 ops
|
||||||
|
ops_per_element = 8
|
||||||
|
total_ops = actual_elements * ops_per_element
|
||||||
|
gflops = total_ops / median_time_s / 1e9
|
||||||
|
|
||||||
|
# Memory bandwidth: bfloat16 inputs (2 bytes), fp8 output (1 byte), scales (4 bytes)
|
||||||
|
input_bytes = actual_tokens * 2 * H * 2 # 2*H bfloat16 inputs
|
||||||
|
output_bytes = actual_tokens * H * 1 # H fp8 outputs
|
||||||
|
scale_bytes = actual_tokens * (H // G) * 4 # scales in float32
|
||||||
|
total_bytes = input_bytes + output_bytes + scale_bytes
|
||||||
|
memory_bw = total_bytes / median_time_s / 1e9
|
||||||
|
|
||||||
|
HOPPER_BANDWIDTH_TBPS = 3.35
|
||||||
|
return (
|
||||||
|
median_time_ms,
|
||||||
|
gflops,
|
||||||
|
memory_bw,
|
||||||
|
(memory_bw / (HOPPER_BANDWIDTH_TBPS * 1024)) * 100,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
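To make the bandwidth-utilization figure concrete, here is the byte count the accounting above attributes to one call; the numbers are an arbitrary example, but the formula mirrors the one in `benchmark()`:

```python
# Example: hidden size 7168, group size 128, and 1024 quantized tokens.
actual_tokens, H, G = 1024, 7168, 128

input_bytes = actual_tokens * 2 * H * 2     # bfloat16 gate+up input: 2*H values, 2 bytes each
output_bytes = actual_tokens * H * 1        # fp8 output: H values, 1 byte each
scale_bytes = actual_tokens * (H // G) * 4  # one float32 scale per group of 128
total_bytes = input_bytes + output_bytes + scale_bytes

print(total_bytes / 1e6)  # ~36.9 MB of traffic assumed per call under this model
```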
def create_comparison_plot(
    ratio, cuda_times, baseline_times, config_labels, strategy_name, id
):
    """Create a comparison plot for a specific generation strategy"""
    fig, ax = plt.subplots(1, 1, figsize=(16, 6))

    # Configure x-axis positions
    x = np.arange(len(config_labels))
    width = 0.35

    # Execution Time plot (lower is better)
    ax.bar(
        x - width / 2, cuda_times, width, label="CUDA Kernel", alpha=0.8, color="blue"
    )
    ax.bar(
        x + width / 2,
        baseline_times,
        width,
        label="Baseline",
        alpha=0.8,
        color="orange",
    )

    # Add speedup labels over each bar pair
    for i in range(len(x)):
        speedup = ratio[i]
        max_height = max(cuda_times[i], baseline_times[i])
        ax.text(
            x[i],
            max_height + max_height * 0.02,
            f"{speedup:.2f}x",
            ha="center",
            va="bottom",
            fontweight="bold",
            fontsize=9,
        )

    ax.set_xlabel("Configuration")
    ax.set_ylabel("% Utilization")
    ax.set_title(
        f"Memory Bandwidth Utilization (%) - {strategy_name}\n(Higher is Better)"
    )
    ax.set_xticks(x)
    ax.set_xticklabels(config_labels, rotation=45, ha="right")
    ax.legend()
    ax.grid(True, alpha=0.3)

    plt.tight_layout()
    return fig, ax


def create_combined_plot(all_results):
    """Create a combined plot with all strategies in one PNG"""
    num_strategies = len(all_results)
    fig, axes = plt.subplots(num_strategies, 1, figsize=(20, 6 * num_strategies))

    if num_strategies == 1:
        axes = [axes]

    for idx, (
        strategy_name,
        ratio,
        cuda_times,
        baseline_times,
        config_labels,
    ) in enumerate(all_results):
        ax = axes[idx]

        # Configure x-axis positions
        x = np.arange(len(config_labels))
        width = 0.35

        # Execution Time plot (lower is better)
        ax.bar(
            x - width / 2,
            cuda_times,
            width,
            label="CUDA Kernel",
            alpha=0.8,
            color="blue",
        )
        ax.bar(
            x + width / 2,
            baseline_times,
            width,
            label="Baseline",
            alpha=0.8,
            color="orange",
        )

        # Add speedup labels over each bar pair
        for i in range(len(x)):
            speedup = ratio[i]
            max_height = max(cuda_times[i], baseline_times[i])
            ax.text(
                x[i],
                max_height + max_height * 0.02,
                f"{speedup:.2f}x",
                ha="center",
                va="bottom",
                fontweight="bold",
                fontsize=9,
            )

        ax.set_xlabel("Configuration")
        ax.set_ylabel("% Utilization")
        ax.set_title(
            f"Memory Bandwidth Utilization (%) - {strategy_name}\n(Higher is Better)"
        )
        ax.set_xticks(x)
        ax.set_xticklabels(config_labels, rotation=45, ha="right")
        ax.legend()
        ax.grid(True, alpha=0.3)

    plt.tight_layout()
    filename = "../../silu_bench/silu_benchmark_combined.png"
    plt.savefig(filename, dpi=300, bbox_inches="tight")
    plt.show()

    return filename


outer_dim = 7168
configs = [
    # DeepSeekV3 Configs
    (8, 1024, 7168),
    (32, 1024, 7168),
    (256, 1024, 7168),
]

runs = 100
num_warmups = 20

strategy_descriptions = {
    "uniform": "Uniform Random",
    "max_t": "Even Assignment",
    "first_t": "experts[0] = T, experts[1:] = 0",
}

print(f"GPU: {torch.cuda.get_device_name()}")
print(f"Testing strategies: {', '.join(strategies)}")
print(f"Configurations: {len(configs)} configs")

all_results = []

# Run benchmarks for each strategy
for id, strategy in enumerate(strategies):
    print(f"\n{'=' * 60}")
    print(f"Testing strategy: {strategy_descriptions[strategy]}")
    print(f"{'=' * 60}")

    # Collect benchmark data for both algorithms
    config_labels = []
    config_x_axis = []
    all_cuda_results = []
    all_baseline_results = []
    all_ratios = []

    for E, T, H in configs:
        total_tokens_config = [8 * E, 16 * E, 32 * E, 64 * E, 128 * E, 256 * E]
        config_x_axis.append(total_tokens_config)

        cuda_results = []
        baseline_results = []
        ratios = []

        for total_tokens in total_tokens_config:
            config_label = f"E={E},T={T},H={H},TT={total_tokens}"
            config_labels.append(config_label)

            # CUDA kernel results
            time_ms_cuda, gflops, gbps, perc = benchmark(
                silu_mul_fp8_quant_deep_gemm_cuda,
                E,
                T,
                H,
                total_tokens,
                runs=runs,
                num_warmups=num_warmups,
                gen_strategy=strategy,
            )
            cuda_results.append((time_ms_cuda, gflops, gbps, perc))

            # Baseline results
            time_ms_triton, gflops, gbps, perc = benchmark(
                silu_mul_fp8_quant_deep_gemm_triton,
                E,
                T,
                H,
                total_tokens,
                runs=runs,
                num_warmups=num_warmups,
                gen_strategy=strategy,
            )
            baseline_results.append((time_ms_triton, gflops, gbps, perc))
            ratios.append(time_ms_triton / time_ms_cuda)

            print(f"Completed: {config_label}")
        all_cuda_results.append(cuda_results)
        all_baseline_results.append(baseline_results)
        all_ratios.append(ratios)

    # Store results for combined plotting
    all_results.append(
        (
            strategy_descriptions[strategy],
            all_ratios,
            all_cuda_results,
            all_baseline_results,
            config_labels,
            config_x_axis,
        )
    )

    # Print summary table for this strategy
    print(f"\nSummary Table - {strategy_descriptions[strategy]}:")
    print(f"{'Config':<20} {'CUDA Time(ms)':<12} {'Base Time(ms)':<12} {'Speedup':<8}")
    print("-" * 60)

    for i, (E, T, H) in enumerate(configs):
        speedup = baseline_results[i][0] / cuda_results[i][0]
        config_label = f"E={E:3d},T={T:4d},H={H:4d}"
        print(
            f"{config_label:<20} {cuda_results[i][0]:8.5f} "
            f"{baseline_results[i][0]:8.5f} {speedup:6.2f}x"
        )


def create_total_tokens_plot(all_results):
    num_strategies = len(all_results)
    num_configs = len(configs)

    # Create side-by-side subplots: 2 columns for speedup and bandwidth percentage
    fig, axs = plt.subplots(
        num_strategies, num_configs * 2, figsize=(28, 6 * num_strategies)
    )

    # Add main title to the entire figure
    fig.suptitle(
        "Performance Analysis: Speedup vs Bandwidth Utilization (Triton & CUDA)",
        fontsize=16,
        fontweight="bold",
        y=0.98,
    )

    # Handle single strategy case
    if num_strategies == 1:
        axs = axs.reshape(1, -1)

    # Handle single config case
    if num_configs == 1:
        axs = axs.reshape(-1, 2)

    for strategy_idx, result in enumerate(all_results):
        (
            strategy_name,
            all_ratios,
            all_cuda_results,
            all_baseline_results,
            config_labels,
            config_x_axis,
        ) = result

        for config_idx in range(num_configs):
            # Speedup plot (left column)
            ax_speedup = axs[strategy_idx, config_idx * 2]
            # Bandwidth plot (right column)
            ax_bandwidth = axs[strategy_idx, config_idx * 2 + 1]

            E, T, H = configs[config_idx]
            ratios = all_ratios[config_idx]
            total_tokens_values = config_x_axis[config_idx]

            # Extract CUDA and Triton bandwidth percentages
            cuda_bandwidth_percentages = [
                result[3] for result in all_cuda_results[config_idx]
            ]
            triton_bandwidth_percentages = [
                result[3] for result in all_baseline_results[config_idx]
            ]

            # Plot speedup ratios vs total tokens (left plot)
            ax_speedup.plot(
                total_tokens_values, ratios, "bo-", linewidth=3, markersize=8
            )
            ax_speedup.set_title(
                f"{strategy_name}\nSpeedup (CUDA/Triton)\nE={E}, T={T}, H={H}",
                fontsize=12,
                fontweight="bold",
            )
            ax_speedup.set_xlabel("Total Tokens", fontweight="bold", fontsize=11)
            ax_speedup.set_ylabel("Speedup Ratio", fontweight="bold", fontsize=11)
            ax_speedup.grid(True, alpha=0.3)

            ax_bandwidth.plot(
                total_tokens_values,
                cuda_bandwidth_percentages,
                "ro-",
                linewidth=3,
                markersize=8,
                label="CUDA",
            )
            ax_bandwidth.plot(
                total_tokens_values,
                triton_bandwidth_percentages,
                "go-",
                linewidth=3,
                markersize=8,
                label="Triton",
            )
            ax_bandwidth.set_title(
                f"{strategy_name}\nBandwidth Utilization (Hopper)\nE={E}, T={T}, H={H}",
                fontsize=12,
                fontweight="bold",
            )
            ax_bandwidth.set_xlabel("Total Tokens", fontweight="bold", fontsize=11)
            ax_bandwidth.set_ylabel(
                "% of Peak Bandwidth", fontweight="bold", fontsize=11
            )
            ax_bandwidth.legend(prop={"weight": "bold"})
            ax_bandwidth.grid(True, alpha=0.3)

            # Format x-axis labels for both plots
            for ax in [ax_speedup, ax_bandwidth]:
                ax.set_xticks(total_tokens_values)
                ax.set_xticklabels(
                    [
                        f"{tt // 1000}K" if tt >= 1000 else str(tt)
                        for tt in total_tokens_values
                    ],
                    fontweight="bold",
                )
                # Make tick labels bold
                for label in ax.get_xticklabels() + ax.get_yticklabels():
                    label.set_fontweight("bold")

            # Add value labels on speedup points
            for x, y in zip(total_tokens_values, ratios):
                ax_speedup.annotate(
                    f"{y:.2f}x",
                    (x, y),
                    textcoords="offset points",
                    xytext=(0, 12),
                    ha="center",
                    fontsize=10,
                    fontweight="bold",
                    bbox=dict(boxstyle="round,pad=0.3", facecolor="white", alpha=0.7),
                )

            # Add value labels on CUDA bandwidth points
            for x, y in zip(total_tokens_values, cuda_bandwidth_percentages):
                ax_bandwidth.annotate(
                    f"{y:.1f}%",
                    (x, y),
                    textcoords="offset points",
                    xytext=(0, 12),
                    ha="center",
                    fontsize=9,
                    fontweight="bold",
                    bbox=dict(boxstyle="round,pad=0.2", facecolor="red", alpha=0.3),
                )

            # Add value labels on Triton bandwidth points
            for x, y in zip(total_tokens_values, triton_bandwidth_percentages):
                ax_bandwidth.annotate(
                    f"{y:.1f}%",
                    (x, y),
                    textcoords="offset points",
                    xytext=(0, -15),
                    ha="center",
                    fontsize=9,
                    fontweight="bold",
                    bbox=dict(boxstyle="round,pad=0.2", facecolor="green", alpha=0.3),
                )

    plt.tight_layout()
    plt.subplots_adjust(top=0.93)  # Make room for main title
    filename = "silu_benchmark_total_tokens.png"
    plt.savefig(filename, dpi=300, bbox_inches="tight")
    plt.show()

    return filename


# Create combined plot with all strategies
combined_plot_filename = create_total_tokens_plot(all_results)

print(f"\n{'=' * 60}")
print("Benchmark Complete!")
print(f"Generated combined plot: {combined_plot_filename}")
print(f"{'=' * 60}")
@@ -3,16 +3,17 @@

import csv
import os
from datetime import datetime
from typing import Optional

import flashinfer
import torch

from vllm.utils import round_up

FLOAT32_BYTES = torch.finfo(torch.float).bits // 8
FP8_DTYPE = torch.float8_e4m3fn
FP4_DTYPE = torch.uint8


def to_float8(x, dtype=torch.float8_e4m3fn):

@@ -26,65 +27,106 @@ def to_float8(x, dtype=torch.float8_e4m3fn):

@torch.no_grad()
def benchmark_decode(
    dtype: torch.dtype,
    quant_dtypes: tuple[
        Optional[torch.dtype], Optional[torch.dtype], Optional[torch.dtype]
    ],
    batch_size: int,
    max_seq_len: int,
    num_heads: tuple[int, int] = (64, 8),
    head_size: int = 128,
    kv_layout: str = "HND",
    block_size: int = 16,
    warmup: int = 10,
    trials: int = 20,
):
    torch.set_default_device("cuda")
    torch.manual_seed(0)

    q_quant_dtype, kv_quant_dtype, o_quant_dtype = quant_dtypes
    q_quant_dtype = q_quant_dtype or dtype
    kv_quant_dtype = kv_quant_dtype or dtype
    o_quant_dtype = o_quant_dtype or dtype

    num_qo_heads, num_kv_heads = num_heads
    assert num_qo_heads % num_kv_heads == 0

    sm_scale = float(1.0 / (head_size**0.5))

    # large number to reduce kv_cache reuse
    NUM_BLOCKS = int(256000 / block_size)

    kv_cache_shape = None
    if kv_layout == "NHD":
        kv_cache_shape = (NUM_BLOCKS, 2, block_size, num_kv_heads, head_size)
    elif kv_layout == "HND":
        kv_cache_shape = (NUM_BLOCKS, 2, num_kv_heads, block_size, head_size)
    else:
        raise ValueError(f"Invalid kv_layout: {kv_layout}")

    # Always using 1.0 scale to reflect the real perf in benchmarking
    q_scale = 1.0
    ref_query = torch.randn(batch_size, num_qo_heads, head_size, dtype=dtype)
    if q_quant_dtype == FP8_DTYPE:
        query, _ = to_float8(ref_query)
    else:
        query = ref_query

    kv_lens = torch.randint(1, max_seq_len, (batch_size,), dtype=torch.int32)
    kv_lens[-1] = max_seq_len

    seq_lens = kv_lens
    max_seq_len = torch.max(seq_lens).item()

    # Always using 1.0 scale to reflect the real perf in benchmarking
    k_scale = v_scale = 1.0
    ref_kv_cache = torch.randn(kv_cache_shape, dtype=dtype)
    if kv_quant_dtype == FP8_DTYPE:
        kv_cache, _ = to_float8(ref_kv_cache)
    else:
        kv_cache = ref_kv_cache

    max_num_blocks_per_seq = (max_seq_len + block_size - 1) // block_size
    block_tables = torch.randint(
        0, NUM_BLOCKS, (batch_size, max_num_blocks_per_seq), dtype=torch.int32
    )
    kv_indptr = [0]
    kv_indices = []
    kv_last_page_lens = []
    for i in range(batch_size):
        seq_len = seq_lens[i]
        assert seq_len > 0
        num_blocks = (seq_len + block_size - 1) // block_size
        kv_indices.extend(block_tables[i, :num_blocks])
        kv_indptr.append(kv_indptr[-1] + num_blocks)
        kv_last_page_len = seq_len % block_size
        if kv_last_page_len == 0:
            kv_last_page_len = block_size
        kv_last_page_lens.append(kv_last_page_len)

    kv_indptr = torch.tensor(kv_indptr, dtype=torch.int32)
    kv_indices = torch.tensor(kv_indices, dtype=torch.int32)
    kv_last_page_lens = torch.tensor(kv_last_page_lens, dtype=torch.int32)
    workspace_buffer = torch.zeros(1024 * 1024 * 1024, dtype=torch.int8)

    wrapper = flashinfer.BatchDecodeWithPagedKVCacheWrapper(
        workspace_buffer,
        kv_layout,
        use_tensor_cores=True,
    )
    wrapper.plan(
        kv_indptr,
        kv_indices,
        kv_last_page_lens,
        num_qo_heads,
        num_kv_heads,
        head_size,
        block_size,
        "NONE",
        sm_scale=sm_scale,
        q_data_type=dtype,
        kv_data_type=dtype,
    )

    def time_fn(fn, warmup=10, trials=20):
        torch.cuda.synchronize()

@@ -101,74 +143,72 @@ def benchmark_decode(
            times.append(start.elapsed_time(end))  # ms
        return sum(times) / len(times), torch.std(torch.tensor(times))

    o_scale = 1.0
    o_sf_scale = None
    output_baseline = torch.empty(ref_query.shape, dtype=dtype)
    if o_quant_dtype == FP4_DTYPE:
        o_sf_scale = 500.0
        output_trtllm = flashinfer.utils.FP4Tensor(
            torch.empty(query.shape[:-1] + (query.shape[-1] // 2,), dtype=torch.uint8),
            torch.empty(
                (
                    round_up(query.shape[0], 128),
                    round_up(query.shape[1] * query.shape[2] // 16, 4),
                ),
                dtype=torch.float8_e4m3fn,
            ),
        )
    else:
        output_trtllm = torch.empty(query.shape, dtype=o_quant_dtype)

    def baseline_decode():
        return wrapper.run(
            ref_query,
            ref_kv_cache,
            k_scale=k_scale,
            v_scale=v_scale,
            out=output_baseline,
        )

    def trtllm_decode():
        return flashinfer.decode.trtllm_batch_decode_with_kv_cache(
            query=query,
            kv_cache=kv_cache,
            workspace_buffer=workspace_buffer,
            block_tables=block_tables,
            seq_lens=seq_lens,
            max_seq_len=max_seq_len,
            bmm1_scale=q_scale * k_scale * sm_scale,
            bmm2_scale=v_scale / o_scale,
            o_sf_scale=o_sf_scale,
            out=output_trtllm,
        )

    baseline_mean, baseline_std = time_fn(baseline_decode)
    trtllm_mean, trtllm_std = time_fn(trtllm_decode)

    # Calculate percentage speedup (positive means TRT is faster)
    speedup_percent = (baseline_mean - trtllm_mean) / baseline_mean

    print(
        f"\t{batch_size}\t{max_seq_len}\t{trtllm_mean:.3f}\t{trtllm_std.item():.3f}"
        f"\t{baseline_mean:.3f}\t{baseline_std.item():.3f}\t{speedup_percent:.3f}"
    )

    # Return results for CSV writing
    return {
        "batch_size": batch_size,
        "trtllm_mean": trtllm_mean,
        "trtllm_std": trtllm_std.item(),
        "baseline_mean": baseline_mean,
        "baseline_std": baseline_std.item(),
        "speedup_percent": speedup_percent,
        "q_dtype": str(q_quant_dtype),
        "kv_cache_dtype": str(kv_quant_dtype),
        "output_dtype": str(o_quant_dtype),
        "block_size": block_size,
        "num_kv_heads": num_kv_heads,
        "head_size": head_size,
        "max_seq_len": max_seq_len,
    }

@@ -180,17 +220,18 @@ def write_results_to_csv(results, filename=None):
        filename = f"flashinfer_trtllm_benchmark_{timestamp}.csv"

    fieldnames = [
        "batch_size",
        "trtllm_mean",
        "trtllm_std",
        "baseline_mean",
        "baseline_std",
        "speedup_percent",
        "q_dtype",
        "kv_cache_dtype",
        "output_dtype",
        "block_size",
        "num_kv_heads",
        "head_size",
        "max_seq_len",
    ]

@@ -209,45 +250,44 @@ def write_results_to_csv(results, filename=None):


if __name__ == "__main__":
    batch_sizes = [1, 4, 8, 16, 32, 64, 128, 256]
    max_seq_lens = [1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072]
    all_results = []

    dtype = torch.bfloat16
    quant_dtypes = [
        # (q_quant_dtype, kv_quant_dtype, o_quant_dtype)
        (None, None, None),
        (None, FP8_DTYPE, None),
        (FP8_DTYPE, FP8_DTYPE, None),
        (FP8_DTYPE, FP8_DTYPE, FP8_DTYPE),
        (FP8_DTYPE, FP8_DTYPE, FP4_DTYPE),
    ]

    for quant_dtype in quant_dtypes:
        q_quant_dtype, kv_quant_dtype, o_quant_dtype = quant_dtype
        q_quant_dtype = q_quant_dtype or dtype
        kv_quant_dtype = kv_quant_dtype or dtype
        o_quant_dtype = o_quant_dtype or dtype

        print(
            f"Running benchmark for q_dtype = {q_quant_dtype}, "
            f"kv_cache_dtype: {kv_quant_dtype}, "
            f"output_dtype: {o_quant_dtype}"
        )
        print(
            "\tbatch_size\tmax_seq_len\ttrtllm_mean\ttrtllm_std\tbaseline_mean\t"
            "baseline_std\tspeedup_percent"
        )
        for max_seq_len in max_seq_lens:
            for bs in batch_sizes:
                result = benchmark_decode(
                    dtype=dtype,
                    quant_dtypes=quant_dtype,
                    batch_size=bs,
                    max_seq_len=max_seq_len,
                )
                all_results.append(result)

    # Write all results to CSV
    write_results_to_csv(all_results)
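The indptr/last-page-length bookkeeping built in the loop above follows the usual paged-KV convention; a small standalone example (lengths chosen arbitrarily) shows the arithmetic:

```python
block_size = 16
seq_lens = [37, 16, 5]  # arbitrary example sequence lengths

kv_indptr = [0]
kv_last_page_lens = []
for seq_len in seq_lens:
    num_blocks = (seq_len + block_size - 1) // block_size  # ceil division
    kv_indptr.append(kv_indptr[-1] + num_blocks)
    last = seq_len % block_size or block_size  # a full last page keeps block_size
    kv_last_page_lens.append(last)

print(kv_indptr)          # [0, 3, 4, 5]
print(kv_last_page_lens)  # [5, 16, 5]
```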
@@ -3,16 +3,17 @@

import csv
import os
from datetime import datetime
from typing import Optional

import flashinfer
import torch

from vllm.utils import round_up

FLOAT32_BYTES = torch.finfo(torch.float).bits // 8
FP8_DTYPE = torch.float8_e4m3fn
FP4_DTYPE = torch.uint8


def to_float8(x, dtype=torch.float8_e4m3fn):

@@ -26,84 +27,100 @@ def to_float8(x, dtype=torch.float8_e4m3fn):

@torch.no_grad()
def benchmark_prefill(
    dtype: torch.dtype,
    quant_dtypes: tuple[
        Optional[torch.dtype], Optional[torch.dtype], Optional[torch.dtype]
    ],
    batch_size: int,
    max_seq_len: int,
    num_heads: tuple[int, int] = (64, 8),
    head_size: int = 128,
    kv_layout: str = "HND",
    block_size: int = 16,
    warmup: int = 10,
    trials: int = 20,
):
    torch.set_default_device("cuda")
    torch.manual_seed(0)

    q_quant_dtype, kv_quant_dtype, o_quant_dtype = quant_dtypes
    q_quant_dtype = q_quant_dtype or dtype
    kv_quant_dtype = kv_quant_dtype or dtype
    o_quant_dtype = o_quant_dtype or dtype

    max_q_len = max_kv_len = max_seq_len

    num_qo_heads, num_kv_heads = num_heads
    assert num_qo_heads % num_kv_heads == 0

    sm_scale = float(1.0 / (head_size**0.5))

    # large number to reduce kv_cache reuse
    NUM_BLOCKS = int(256000 / block_size)

    kv_cache_shape = None
    if kv_layout == "NHD":
        kv_cache_shape = (NUM_BLOCKS, 2, block_size, num_kv_heads, head_size)
    elif kv_layout == "HND":
        kv_cache_shape = (NUM_BLOCKS, 2, num_kv_heads, block_size, head_size)
    else:
        raise ValueError(f"Invalid kv_layout: {kv_layout}")

    q_lens = torch.randint(1, max_q_len, (batch_size,), dtype=torch.int32)
    q_lens[-1] = max_q_len

    q_indptr = torch.cat(
        [
            torch.tensor([0], dtype=torch.int32),
            torch.cumsum(q_lens, dim=0, dtype=torch.int32),
        ]
    )

    # Always using 1.0 scale to reflect the real perf in benchmarking
    q_scale = 1.0
    ref_query = torch.randn(
        torch.sum(q_lens).item(), num_qo_heads, head_size, dtype=dtype
    )
    if q_quant_dtype == FP8_DTYPE:
        query, _ = to_float8(ref_query)
    else:
        query = ref_query

    kv_lens = torch.randint(0, max_kv_len, (batch_size,), dtype=torch.int32)
    kv_lens[-1] = max_kv_len

    seq_lens = kv_lens + q_lens
    max_seq_len = torch.max(seq_lens).item()

    # Always using 1.0 scale to reflect the real perf in benchmarking
    k_scale = v_scale = 1.0
    ref_kv_cache = torch.randn(kv_cache_shape, dtype=dtype)
    if kv_quant_dtype == FP8_DTYPE:
        kv_cache, _ = to_float8(ref_kv_cache)
    else:
        kv_cache = ref_kv_cache

    max_num_blocks_per_seq = (max_seq_len + block_size - 1) // block_size
    block_tables = torch.randint(
        0, NUM_BLOCKS, (batch_size, max_num_blocks_per_seq), dtype=torch.int32
    )

    kv_indptr = [0]
    kv_indices = []
    kv_last_page_lens = []
    for i in range(batch_size):
        seq_len = seq_lens[i]
        assert seq_len > 0
        num_blocks = (seq_len + block_size - 1) // block_size
        kv_indices.extend(block_tables[i, :num_blocks])
        kv_indptr.append(kv_indptr[-1] + num_blocks)
        kv_last_page_len = seq_len % block_size
        if kv_last_page_len == 0:
            kv_last_page_len = block_size
        kv_last_page_lens.append(kv_last_page_len)

    kv_indptr = torch.tensor(kv_indptr, dtype=torch.int32)
    kv_indices = torch.tensor(kv_indices, dtype=torch.int32)
    kv_last_page_lens = torch.tensor(kv_last_page_lens, dtype=torch.int32)
    workspace_buffer = torch.zeros(1024 * 1024 * 1024, dtype=torch.int8)

    wrapper = flashinfer.BatchPrefillWithPagedKVCacheWrapper(
        workspace_buffer, kv_layout

@@ -115,12 +132,12 @@ def benchmark_prefill(
        kv_last_page_lens,
        num_qo_heads,
        num_kv_heads,
        head_size,
        block_size,
        causal=True,
        sm_scale=sm_scale,
        q_data_type=dtype,
        kv_data_type=dtype,
    )

    def time_fn(fn, warmup=10, trials=20):

@@ -138,52 +155,76 @@ def benchmark_prefill(
            times.append(start.elapsed_time(end))  # ms
        return sum(times) / len(times), torch.std(torch.tensor(times))

    o_scale = 1.0
    o_sf_scale = None
    output_baseline = torch.empty(ref_query.shape, dtype=dtype)
    if o_quant_dtype == FP4_DTYPE:
        o_sf_scale = 500.0
        output_trtllm = flashinfer.utils.FP4Tensor(
            torch.empty(query.shape[:-1] + (query.shape[-1] // 2,), dtype=torch.uint8),
            torch.empty(
                (
                    round_up(query.shape[0], 128),
                    round_up(query.shape[1] * query.shape[2] // 16, 4),
                ),
                dtype=torch.float8_e4m3fn,
            ),
        )
    else:
        output_trtllm = torch.empty(query.shape, dtype=o_quant_dtype)

    def baseline_prefill():
        return wrapper.run(
            ref_query,
            ref_kv_cache,
            k_scale=k_scale,
            v_scale=v_scale,
            out=output_baseline,
        )

    def trtllm_prefill():
        return flashinfer.prefill.trtllm_batch_context_with_kv_cache(
            query=query,
            kv_cache=kv_cache,
            workspace_buffer=workspace_buffer,
            block_tables=block_tables,
            seq_lens=seq_lens,
            max_q_len=max_q_len,
            max_kv_len=max_seq_len,
            bmm1_scale=q_scale * k_scale * sm_scale,
            bmm2_scale=v_scale / o_scale,
            batch_size=batch_size,
            cum_seq_lens_q=q_indptr,
            cum_seq_lens_kv=kv_indptr,
            o_sf_scale=o_sf_scale,
            out=output_trtllm,
        )

    baseline_mean, baseline_std = time_fn(baseline_prefill)
    trtllm_mean, trtllm_std = time_fn(trtllm_prefill)

    # Calculate percentage speedup (positive means TRT is faster)
    speedup_percent = (baseline_mean - trtllm_mean) / baseline_mean

    print(
        f"\t{batch_size}\t{max_seq_len}\t{trtllm_mean:8.3f}\t{trtllm_std.item():8.3f}"
        f"\t{baseline_mean:8.3f}\t{baseline_std.item():8.3f}\t{speedup_percent:8.3f}"
    )

    # Return results for CSV writing
    return {
        "batch_size": batch_size,
        "trtllm_mean": trtllm_mean,
        "trtllm_std": trtllm_std.item(),
        "baseline_mean": baseline_mean,
        "baseline_std": baseline_std.item(),
        "speedup_percent": speedup_percent,
        "q_dtype": str(q_quant_dtype),
        "kv_cache_dtype": str(kv_quant_dtype),
        "output_dtype": str(o_quant_dtype),
        "block_size": block_size,
        "num_kv_heads": num_kv_heads,
        "head_size": head_size,
        "max_seq_len": max_seq_len,
    }

@@ -195,17 +236,18 @@ def write_results_to_csv(results, filename=None):
        filename = f"flashinfer_trtllm_benchmark_{timestamp}.csv"

    fieldnames = [
        "batch_size",
        "trtllm_mean",
        "trtllm_std",
        "baseline_mean",
        "baseline_std",
        "speedup_percent",
        "q_dtype",
        "kv_cache_dtype",
        "output_dtype",
        "block_size",
        "num_kv_heads",
        "head_size",
        "max_seq_len",
    ]

@@ -224,27 +266,43 @@ def write_results_to_csv(results, filename=None):


if __name__ == "__main__":
    batch_sizes = [1, 4, 8, 16, 32, 64, 128, 256]
    max_seq_lens = [1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072]
    all_results = []

    dtype = torch.bfloat16
    quant_dtypes = [
        # (q_quant_dtype, kv_quant_dtype, o_quant_dtype)
        (None, None, None),
        (FP8_DTYPE, FP8_DTYPE, None),
        (FP8_DTYPE, FP8_DTYPE, FP8_DTYPE),
        (FP8_DTYPE, FP8_DTYPE, FP4_DTYPE),
    ]

    for quant_dtype in quant_dtypes:
        q_quant_dtype, kv_quant_dtype, o_quant_dtype = quant_dtype
        q_quant_dtype = q_quant_dtype or dtype
        kv_quant_dtype = kv_quant_dtype or dtype
        o_quant_dtype = o_quant_dtype or dtype

        print(
            f"Running benchmark for q_dtype = {q_quant_dtype}, "
            f"kv_cache_dtype: {kv_quant_dtype}, "
            f"output_dtype: {o_quant_dtype}"
        )
        print(
            "\tbatch_size\tmax_seq_len\ttrtllm_mean\ttrtllm_std\tbaseline_mean\t"
            "baseline_std\tspeedup_percent"
        )
        for max_seq_len in max_seq_lens:
            for bs in batch_sizes:
                result = benchmark_prefill(
                    dtype=dtype,
                    quant_dtypes=quant_dtype,
                    batch_size=bs,
                    max_seq_len=max_seq_len,
                )
                all_results.append(result)

    # Write all results to CSV
    write_results_to_csv(all_results)
@@ -11,13 +11,13 @@ from datetime import datetime
from typing import Any

import torch
from tqdm import tqdm

from vllm.model_executor.layers.quantization.utils.fp8_utils import (
    _w8a8_block_fp8_matmul,
)
from vllm.platforms import current_platform
from vllm.triton_utils import triton
from vllm.utils import FlexibleArgumentParser

mp.set_start_method("spawn", force=True)

@@ -56,7 +56,7 @@ def w8a8_block_matmul(
        Bs: The per-block quantization scale for `B`.
        block_size: The block size for per-block quantization.
            It should be 2-dim, e.g., [128, 128].
        output_dtype: The dtype of the returned tensor.

    Returns:
        torch.Tensor: The result of matmul.

@@ -141,6 +141,7 @@ def get_weight_shapes(tp_size):
    # cannot TP
    total = [
        (512 + 64, 7168),
        (2112, 7168),
        ((128 + 64) * 128, 7168),
        (128 * (128 + 128), 512),
        (7168, 16384),
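For the block-quantized matmul being tuned above, each weight scale covers one `[128, 128]` tile while activations are scaled per token group of 128; a rough sketch of the scale-tensor shapes under that assumed convention (an illustration of the layout, not the tuned kernel itself):

```python
import math

M, N, K = 4096, 2112, 7168  # example GEMM taken from the weight shapes above
block_n, block_k = 128, 128

# Assumed convention: activations quantized per (token, block_k) group,
# weights quantized per (block_n, block_k) tile.
As_shape = (M, math.ceil(K / block_k))                       # (4096, 56)
Bs_shape = (math.ceil(N / block_n), math.ceil(K / block_k))  # (17, 56)
print(As_shape, Bs_shape)
```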
@@ -8,12 +8,16 @@ import torch

from vllm import _custom_ops as ops
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
    per_token_group_quant_fp8,
    w8a8_triton_block_scaled_mm,
)
from vllm.triton_utils import triton
from vllm.utils.deep_gemm import (
    calc_diff,
    fp8_gemm_nt,
    get_col_major_tma_aligned_tensor,
    per_block_cast_to_fp8,
)


def benchmark_shape(m: int,

@@ -59,7 +63,7 @@ def benchmark_shape(m: int,

    # === vLLM Triton Implementation ===
    def vllm_triton_gemm():
        return w8a8_triton_block_scaled_mm(A_vllm,
                                           B_vllm,
                                           A_scale_vllm,
                                           B_scale_vllm,

@@ -95,4 +95,10 @@ WEIGHT_SHAPES = {
        ([2048, 2816], 1),
        ([1408, 2048], 0),
    ],
    "CohereLabs/c4ai-command-a-03-2025": [
        ([12288, 14336], 1),
        ([12288, 12288], 0),
        ([12288, 73728], 1),
        ([36864, 12288], 0),
    ],
}
@@ -5,11 +5,13 @@ The requirements (pip) for `benchmark_serving_multi_turn.py` can be found in `re
First start serving your model

```bash
export MODEL_PATH=/models/meta-llama/Meta-Llama-3.1-8B-Instruct/

vllm serve $MODEL_PATH --served-model-name Llama --disable-log-requests
```

The variable `MODEL_PATH` should be a path to the model files (e.g. downloaded from huggingface).

## Synthetic Multi-Turn Conversations

Download the following text file (used for generation of synthetic conversations)

@@ -26,10 +28,10 @@ But you may use other text files if you prefer (using this specific file is not
Then run the benchmarking script

```bash
export MODEL_PATH=/models/meta-llama/Meta-Llama-3.1-8B-Instruct/

python benchmark_serving_multi_turn.py --model $MODEL_PATH --served-model-name Llama \
    --input-file generate_multi_turn.json --num-clients 2 --max-active-conversations 6
```

You can edit the file `generate_multi_turn.json` to change the conversation parameters (number of turns, etc.).

@@ -53,6 +55,107 @@ output_num_chunks 166.0 99.01 11.80 79.00 90.00 98.00 108.75
----------------------------------------------------------------------------------------------------
```

### JSON configuration file for synthetic conversations generation

The input flag `--input-file` is used to determine the input conversations for the benchmark.<br/>
When the input is a JSON file with the field `"filetype": "generate_conversations"` the tool will generate synthetic multi-turn (questions and answers) conversations.

The file `generate_multi_turn.json` is an example file.

The file must contain the sections `prompt_input` and `prompt_output`.

The `prompt_input` section must contain `num_turns`, `prefix_num_tokens` and `num_tokens`:

* `num_turns` - Number of total turns in the conversation (both user & assistant).<br/>
The final value will always be rounded to an even number so each user turn has a reply.
* `prefix_num_tokens` - Tokens added at the start of only the **first user turn** in a conversation (unique per conversation).
* `num_tokens` - Total token length of each **user** message (one turn).

The `prompt_output` section must contain `num_tokens`:

* `num_tokens` - Total token length of each **assistant** message (one turn).

### Random distributions for synthetic conversations generation

When creating an input JSON file (such as `generate_multi_turn.json`),<br/>
every numeric field (such as `num_turns` or `num_tokens`) requires a distribution.<br/>
The distribution determines how to randomly sample values for the field.

The available distributions are listed below.

**Note:** The optional `max` field (for lognormal, zipf, and poisson) can be used to cap sampled values at an upper bound.<br/>
Can be used to make sure that the total number of tokens in every request does not exceed `--max-model-len`.

#### constant

```json
{
    "distribution": "constant",
    "value": 500
}
```

* `value` - the fixed integer value (always returns the same number).

#### uniform

```json
{
    "distribution": "uniform",
    "min": 12,
    "max": 18
}
```

* `min` - minimum value (inclusive).
* `max` - maximum value (inclusive), should be equal or larger than min.

#### lognormal

```json
{
    "distribution": "lognormal",
    "average": 1000,
    "max": 5000
}
```

You can parameterize the lognormal distribution in one of two ways (a worked example of the first follows below):

Using the average and optional median ratio:

* `average` - target average value of the distribution.
* `median_ratio` - the ratio of the median to the average; controls the skewness. Must be in the range (0, 1).

Using the parameters of the underlying normal distribution:

* `mean` - mean of the underlying normal distribution.
* `sigma` - standard deviation of the underlying normal distribution.

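Since the median of a lognormal is `exp(mu)` and its mean is `exp(mu + sigma^2/2)`, the `average`/`median_ratio` parameterization maps to the underlying normal parameters in closed form. A minimal sketch of that mapping, independent of the benchmark code (the example values are taken from the JSON snippet above):

```python
import numpy as np

average, median_ratio = 1000, 0.85
median = average * median_ratio

sigma = np.sqrt(2 * np.log(average / median))  # sigma^2 = 2 * ln(mean / median)
mu = np.log(median)                            # median of a lognormal is exp(mu)

samples = np.random.lognormal(mean=mu, sigma=sigma, size=100_000)
print(samples.mean())  # close to 1000 (before any `max` cap is applied)
```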
#### zipf

```json
{
    "distribution": "zipf",
    "alpha": 1.2,
    "max": 100
}
```

* `alpha` - skew parameter (> 1). Larger values produce stronger skew toward smaller integers.

#### poisson

```json
{
    "distribution": "poisson",
    "alpha": 10,
    "max": 50
}
```

* `alpha` - expected value (λ). Also the variance of the distribution.

## ShareGPT Conversations

To run with the ShareGPT data, download the following ShareGPT dataset:

@@ -99,21 +99,105 @@ class PoissonDistribution(Distribution):

class LognormalDistribution(Distribution):
    def __init__(
        self,
        mean: Optional[float] = None,
        sigma: Optional[float] = None,
        average: Optional[int] = None,
        median_ratio: Optional[float] = None,
        max_val: Optional[int] = None,
    ) -> None:
        self.average = average
        self.median_ratio = median_ratio
        self.max_val = max_val

        if average is not None:
            if average < 1:
                raise ValueError("Lognormal average must be positive")

            if mean or sigma:
                raise ValueError(
                    "When using lognormal average, you can't provide mean/sigma"
                )

            if self.median_ratio is None:
                # Default value that provides relatively wide range of values
                self.median_ratio = 0.85

            # Calculate mean/sigma of np.random.lognormal based on the average
            mean, sigma = self._generate_lognormal_by_median(
                target_average=self.average, median_ratio=self.median_ratio
            )
        else:
            if mean is None or sigma is None:
                raise ValueError(
                    "Must provide both mean and sigma if average is not used"
                )

            if mean <= 0 or sigma < 0:
                raise ValueError(
                    "Lognormal mean must be positive and sigma must be non-negative"
                )

        # Mean and standard deviation of the underlying normal distribution
        # Based on numpy.random.lognormal
        self.mean = mean
        self.sigma = sigma

    @staticmethod
    def _generate_lognormal_by_median(
        target_average: int, median_ratio: float
    ) -> tuple[float, float]:
        """
        Compute (mu, sigma) for a lognormal distribution given:
        - a target average (mean of the distribution)
        - a ratio of median / mean (controls skewness), assume mean > median

        Background:
        If Z ~ Normal(mu, sigma^2), then X = exp(Z) ~ LogNormal(mu, sigma).
        * mean(X) = exp(mu + sigma^2 / 2)
        * median(X) = exp(mu)

        So:
        median / mean = exp(mu) / exp(mu + sigma^2 / 2)
                      = exp(-sigma^2 / 2)

        Rearranging:
        sigma^2 = 2 * ln(mean / median)
        mu = ln(median)

        This gives a unique (mu, sigma) for any valid mean and median.
        """
        # Check input validity: median must be smaller than mean
        if median_ratio <= 0 or median_ratio >= 1:
            raise ValueError("median_ratio must be in range (0, 1)")

        target_median = target_average * median_ratio

        # Solve sigma^2 = 2 * ln(mean / median)
        sigma = np.sqrt(2 * np.log(target_average / target_median))
        mu = np.log(target_median)

        return mu, sigma

    def sample(self, size: int = 1) -> np.ndarray:
        samples = np.random.lognormal(mean=self.mean, sigma=self.sigma, size=size)

        if self.average is not None:
            # Scale to average
            samples *= self.average / samples.mean()

        if self.max_val:
            samples = np.minimum(samples, self.max_val)

        return np.round(samples).astype(int)

    def __repr__(self) -> str:
        if self.average:
            return (
                f"LognormalDistribution[{self.average}, "
                f"{self.median_ratio}, {self.max_val}]"
            )
        return f"LognormalDistribution[{self.mean}, {self.sigma}, {self.max_val}]"


class GenConvArgs(NamedTuple):

@@ -173,10 +257,21 @@
@@ -173,10 +257,21 @@ def get_random_distribution(
         return PoissonDistribution(conf["alpha"], max_val=max_val)
 
     elif distribution == "lognormal":
+        max_val = conf.get("max", None)
+
+        if "average" in conf:
+            # Infer lognormal mean/sigma (numpy) from input average
+            median_ratio = conf.get("median_ratio", None)
+            return LognormalDistribution(
+                average=conf["average"], median_ratio=median_ratio, max_val=max_val
+            )
+
+        # Use mean/sigma directly (for full control over the distribution)
         verify_field_exists(conf, "mean", section, subsection)
         verify_field_exists(conf, "sigma", section, subsection)
-        max_val = conf.get("max", None)
-        return LognormalDistribution(conf["mean"], conf["sigma"], max_val=max_val)
+        return LognormalDistribution(
+            mean=conf["mean"], sigma=conf["sigma"], max_val=max_val
+        )
 
     elif distribution == "uniform":
         verify_field_exists(conf, "min", section, subsection)
@@ -825,9 +825,11 @@ def get_client_config(
 
     # Arguments for API requests
     chat_url = f"{args.url}/v1/chat/completions"
+    model_name = args.served_model_name if args.served_model_name else args.model
+
     req_args = RequestArgs(
         chat_url=chat_url,
-        model=args.model,
+        model=model_name,
         stream=not args.no_stream,
         limit_min_tokens=args.limit_min_tokens,
         limit_max_tokens=args.limit_max_tokens,
@@ -960,7 +962,7 @@ async def main_mp(
 
     # At this point all the clients finished,
     # collect results (TTFT, TPOT, etc.) from all the clients.
-    # This needs to happens before calling join on the clients
+    # This needs to happen before calling join on the clients
     # (result_queue should be emptied).
     while not result_queue.empty():
         client_metrics.append(result_queue.get())
@@ -1247,9 +1249,19 @@ async def main() -> None:
         default=0,
         help="Seed for random number generators (default: 0)",
     )
 
     parser.add_argument(
         "-m", "--model", type=str, required=True, help="Path of the LLM model"
     )
+    parser.add_argument(
+        "--served-model-name",
+        type=str,
+        default=None,
+        help="The model name used in the API. "
+        "If not specified, the model name will be the "
+        "same as the ``--model`` argument. ",
+    )
+
     parser.add_argument(
         "-u",
         "--url",
@@ -15,9 +15,8 @@
     },
    "prefix_num_tokens": {
        "distribution": "lognormal",
-        "mean": 6,
-        "sigma": 4,
-        "max": 1500
+        "average": 1000,
+        "max": 5000
    },
    "num_tokens": {
        "distribution": "uniform",
@@ -1,6 +1,7 @@
 include(FetchContent)
 
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_CXX_STANDARD 17)
 set(CMAKE_CXX_EXTENSIONS ON)
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 
@@ -87,6 +88,7 @@ is_avx512_disabled(AVX512_DISABLED)
 
 if (MACOSX_FOUND AND CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64")
   message(STATUS "Apple Silicon Detected")
+  set(APPLE_SILICON_FOUND TRUE)
   set(ENABLE_NUMA OFF)
   check_sysctl(hw.optional.neon ASIMD_FOUND)
   check_sysctl(hw.optional.arm.FEAT_BF16 ARM_BF16_FOUND)
@@ -99,6 +101,7 @@ else()
   find_isa(${CPUINFO} "asimd" ASIMD_FOUND) # Check for ARM NEON support
   find_isa(${CPUINFO} "bf16" ARM_BF16_FOUND) # Check for ARM BF16 support
   find_isa(${CPUINFO} "S390" S390_FOUND)
+  find_isa(${CPUINFO} "v" RVV_FOUND) # Check for RISC-V RVV support
 endif()
 
 if (AVX512_FOUND AND NOT AVX512_DISABLED)
@@ -175,24 +178,30 @@ elseif (S390_FOUND)
     "-mzvector"
     "-march=native"
     "-mtune=native")
+elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "riscv64")
+  if(RVV_FOUND)
+    message(FAIL_ERROR "Can't support rvv now.")
+  else()
+    list(APPEND CXX_COMPILE_FLAGS "-march=rv64gc")
+  endif()
 else()
-  message(FATAL_ERROR "vLLM CPU backend requires AVX512, AVX2, Power9+ ISA, S390X ISA or ARMv8 support.")
+  message(FATAL_ERROR "vLLM CPU backend requires AVX512, AVX2, Power9+ ISA, S390X ISA, ARMv8 or RISC-V support.")
 endif()
 
 #
 # Build oneDNN for W8A8 GEMM kernels (only for x86-AVX512 /ARM platforms)
 # Flag to enable ACL kernels for AARCH64 platforms
-if ( VLLM_BUILD_ACL STREQUAL "ON")
+if (VLLM_BUILD_ACL STREQUAL "ON")
   set(USE_ACL ON)
 else()
   set(USE_ACL OFF)
 endif()
 
-if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR ASIMD_FOUND)
+if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON_FOUND) OR POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND)
   FetchContent_Declare(
     oneDNN
     GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
-    GIT_TAG v3.8.1
+    GIT_TAG v3.9
     GIT_PROGRESS TRUE
     GIT_SHALLOW TRUE
   )
@@ -204,7 +213,7 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR ASIMD_FOUND)
   endif()
   set(ONEDNN_AARCH64_USE_ACL "ON")
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,-rpath,$ENV{ACL_ROOT_DIR}/build/")
 endif()
 
 set(ONEDNN_LIBRARY_TYPE "STATIC")
 set(ONEDNN_BUILD_DOC "OFF")
@@ -217,38 +226,23 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR ASIMD_FOUND)
   set(ONEDNN_ENABLE_ITT_TASKS "OFF")
   set(ONEDNN_ENABLE_MAX_CPU_ISA "OFF")
   set(ONEDNN_ENABLE_CPU_ISA_HINTS "OFF")
+  set(ONEDNN_VERBOSE "OFF")
   set(CMAKE_POLICY_DEFAULT_CMP0077 NEW)
 
   FetchContent_MakeAvailable(oneDNN)
-  list(APPEND LIBS dnnl)
-elseif(POWER10_FOUND)
-  FetchContent_Declare(
-    oneDNN
-    GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
-    GIT_TAG v3.7.2
-    GIT_PROGRESS TRUE
-    GIT_SHALLOW TRUE
+  add_library(dnnl_ext OBJECT "csrc/cpu/dnnl_helper.cpp")
+  target_include_directories(
+    dnnl_ext
+    PUBLIC ${oneDNN_SOURCE_DIR}/include
+    PUBLIC ${oneDNN_BINARY_DIR}/include
+    PRIVATE ${oneDNN_SOURCE_DIR}/src
   )
-  set(ONEDNN_LIBRARY_TYPE "STATIC")
-  set(ONEDNN_BUILD_DOC "OFF")
-  set(ONEDNN_BUILD_EXAMPLES "OFF")
-  set(ONEDNN_BUILD_TESTS "OFF")
-  set(ONEDNN_ENABLE_WORKLOAD "INFERENCE")
-  set(ONEDNN_ENABLE_PRIMITIVE "MATMUL;REORDER")
-  set(ONEDNN_BUILD_GRAPH "OFF")
-  set(ONEDNN_ENABLE_JIT_PROFILING "OFF")
-  set(ONEDNN_ENABLE_ITT_TASKS "OFF")
-  set(ONEDNN_ENABLE_MAX_CPU_ISA "OFF")
-  set(ONEDNN_ENABLE_CPU_ISA_HINTS "OFF")
-  set(CMAKE_POLICY_DEFAULT_CMP0077 NEW)
-
-  set(DNNL_CPU_RUNTIME "OMP")
-
-  FetchContent_MakeAvailable(oneDNN)
-
-  list(APPEND LIBS dnnl)
+  target_link_libraries(dnnl_ext dnnl)
+  target_compile_options(dnnl_ext PRIVATE ${CXX_COMPILE_FLAGS} -fPIC)
+  list(APPEND LIBS dnnl_ext)
+  set(USE_ONEDNN ON)
+else()
+  set(USE_ONEDNN OFF)
 endif()
 
 message(STATUS "CPU extension compile flags: ${CXX_COMPILE_FLAGS}")
@@ -271,11 +265,11 @@ set(VLLM_EXT_SRC
   "csrc/cpu/layernorm.cpp"
   "csrc/cpu/mla_decode.cpp"
   "csrc/cpu/pos_encoding.cpp"
-  "csrc/cpu/torch_bindings.cpp")
+  "csrc/cpu/torch_bindings.cpp"
+  "csrc/moe/dynamic_4bit_int_moe_cpu.cpp")
 
 if (AVX512_FOUND AND NOT AVX512_DISABLED)
   set(VLLM_EXT_SRC
-    "csrc/cpu/quant.cpp"
     "csrc/cpu/shm.cpp"
     ${VLLM_EXT_SRC})
   if (ENABLE_AVX512BF16 AND ENABLE_AVX512VNNI)
@@ -289,14 +283,11 @@ if (AVX512_FOUND AND NOT AVX512_DISABLED)
       ${VLLM_EXT_SRC})
     add_compile_definitions(-DCPU_CAPABILITY_AVX512)
   endif()
-elseif(POWER10_FOUND)
-  set(VLLM_EXT_SRC
-    "csrc/cpu/quant.cpp"
-    ${VLLM_EXT_SRC})
 endif()
-if (ASIMD_FOUND)
+if(USE_ONEDNN)
   set(VLLM_EXT_SRC
-    "csrc/cpu/quant.cpp"
+    "csrc/cpu/dnnl_kernels.cpp"
     ${VLLM_EXT_SRC})
 endif()
 
@@ -18,8 +18,8 @@ if(FLASH_MLA_SRC_DIR)
 else()
   FetchContent_Declare(
     flashmla
-    GIT_REPOSITORY https://github.com/vllm-project/FlashMLA.git
-    GIT_TAG 0e43e774597682284358ff2c54530757b654b8d1
+    GIT_REPOSITORY https://github.com/vllm-project/FlashMLA
+    GIT_TAG 5f65b85703c7ed75fda01e06495077caad207c3f
     GIT_PROGRESS TRUE
     CONFIGURE_COMMAND ""
     BUILD_COMMAND ""
@@ -33,22 +33,64 @@ message(STATUS "FlashMLA is available at ${flashmla_SOURCE_DIR}")
 # The FlashMLA kernels only work on hopper and require CUDA 12.3 or later.
 # Only build FlashMLA kernels if we are building for something compatible with
 # sm90a
-cuda_archs_loose_intersection(FLASH_MLA_ARCHS "9.0a" "${CUDA_ARCHS}")
-if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.3 AND FLASH_MLA_ARCHS)
+set(SUPPORT_ARCHS)
+if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.3)
+  list(APPEND SUPPORT_ARCHS 9.0a)
+endif()
+if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8)
+  list(APPEND SUPPORT_ARCHS 10.0a)
+endif()
+
+cuda_archs_loose_intersection(FLASH_MLA_ARCHS "${SUPPORT_ARCHS}" "${CUDA_ARCHS}")
+if(FLASH_MLA_ARCHS)
+  set(VLLM_FLASHMLA_GPU_FLAGS ${VLLM_GPU_FLAGS})
+  list(APPEND VLLM_FLASHMLA_GPU_FLAGS "--expt-relaxed-constexpr" "--expt-extended-lambda" "--use_fast_math")
+
   set(FlashMLA_SOURCES
-    ${flashmla_SOURCE_DIR}/csrc/flash_api.cpp
-    ${flashmla_SOURCE_DIR}/csrc/kernels/splitkv_mla.cu
-    ${flashmla_SOURCE_DIR}/csrc/kernels/mla_combine.cu
-    ${flashmla_SOURCE_DIR}/csrc/kernels/get_mla_metadata.cu)
+    ${flashmla_SOURCE_DIR}/csrc/torch_api.cpp
+    ${flashmla_SOURCE_DIR}/csrc/pybind.cpp
+    ${flashmla_SOURCE_DIR}/csrc/smxx/get_mla_metadata.cu
+    ${flashmla_SOURCE_DIR}/csrc/smxx/mla_combine.cu
+    ${flashmla_SOURCE_DIR}/csrc/sm90/decode/dense/splitkv_mla.cu
+    ${flashmla_SOURCE_DIR}/csrc/sm90/decode/sparse_fp8/splitkv_mla.cu
+    ${flashmla_SOURCE_DIR}/csrc/sm90/prefill/sparse/fwd.cu
+    ${flashmla_SOURCE_DIR}/csrc/sm100/decode/sparse_fp8/splitkv_mla.cu
+    ${flashmla_SOURCE_DIR}/csrc/sm100/prefill/dense/fmha_cutlass_fwd_sm100.cu
+    ${flashmla_SOURCE_DIR}/csrc/sm100/prefill/dense/fmha_cutlass_bwd_sm100.cu
+    ${flashmla_SOURCE_DIR}/csrc/sm100/prefill/sparse/fwd.cu
+  )
+
+  set(FlashMLA_Extension_SOURCES
+    ${flashmla_SOURCE_DIR}/csrc/extension/torch_api.cpp
+    ${flashmla_SOURCE_DIR}/csrc/extension/sm90/dense_fp8/pybind.cpp
+    ${flashmla_SOURCE_DIR}/csrc/extension/sm90/dense_fp8/flash_fwd_mla_fp8_sm90.cu
+  )
 
   set(FlashMLA_INCLUDES
+    ${flashmla_SOURCE_DIR}/csrc
+    ${flashmla_SOURCE_DIR}/csrc/sm90
     ${flashmla_SOURCE_DIR}/csrc/cutlass/include
-    ${flashmla_SOURCE_DIR}/csrc/include)
+    ${flashmla_SOURCE_DIR}/csrc/cutlass/tools/util/include
+  )
+
+  set(FlashMLA_Extension_INCLUDES
+    ${flashmla_SOURCE_DIR}/csrc
+    ${flashmla_SOURCE_DIR}/csrc/sm90
+    ${flashmla_SOURCE_DIR}/csrc/extension/sm90/dense_fp8/
+    ${flashmla_SOURCE_DIR}/csrc/cutlass/include
+    ${flashmla_SOURCE_DIR}/csrc/cutlass/tools/util/include
+  )
 
   set_gencode_flags_for_srcs(
     SRCS "${FlashMLA_SOURCES}"
     CUDA_ARCHS "${FLASH_MLA_ARCHS}")
+
+  set_gencode_flags_for_srcs(
+    SRCS "${FlashMLA_Extension_SOURCES}"
+    CUDA_ARCHS "${FLASH_MLA_ARCHS}")
+
   define_gpu_extension_target(
     _flashmla_C
     DESTINATION vllm
@@ -59,8 +101,32 @@ if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.3 AND FLASH_MLA_ARCHS)
     INCLUDE_DIRECTORIES ${FlashMLA_INCLUDES}
     USE_SABI 3
     WITH_SOABI)
+
+  # Keep Stable ABI for the module, but *not* for CUDA/C++ files.
+  # This prevents Py_LIMITED_API from affecting nvcc and C++ compiles.
+  target_compile_options(_flashmla_C PRIVATE
+    $<$<COMPILE_LANGUAGE:CUDA>:-UPy_LIMITED_API>
+    $<$<COMPILE_LANGUAGE:CXX>:-UPy_LIMITED_API>)
+
+  define_gpu_extension_target(
+    _flashmla_extension_C
+    DESTINATION vllm
+    LANGUAGE ${VLLM_GPU_LANG}
+    SOURCES ${FlashMLA_Extension_SOURCES}
+    COMPILE_FLAGS ${VLLM_FLASHMLA_GPU_FLAGS}
+    ARCHITECTURES ${VLLM_GPU_ARCHES}
+    INCLUDE_DIRECTORIES ${FlashMLA_Extension_INCLUDES}
+    USE_SABI 3
+    WITH_SOABI)
+
+  # Keep Stable ABI for the module, but *not* for CUDA/C++ files.
+  # This prevents Py_LIMITED_API from affecting nvcc and C++ compiles.
+  target_compile_options(_flashmla_extension_C PRIVATE
+    $<$<COMPILE_LANGUAGE:CUDA>:-UPy_LIMITED_API>
+    $<$<COMPILE_LANGUAGE:CXX>:-UPy_LIMITED_API>)
 else()
-  # Create an empty target for setup.py when not targeting sm90a systems
+  # Create empty targets for setup.py when not targeting sm90a systems
   add_custom_target(_flashmla_C)
+  add_custom_target(_flashmla_extension_C)
 endif()
@@ -38,7 +38,7 @@ else()
   FetchContent_Declare(
     vllm-flash-attn
     GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
-    GIT_TAG 57b4e68b9f9d94750b46de8f8dbd2bfcc86edd4f
+    GIT_TAG 4695e6bed5366c41e28c06cd86170166e4f43d00
    GIT_PROGRESS TRUE
     # Don't share the vllm-flash-attn build between build types
     BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
@@ -310,13 +310,13 @@ function(cuda_archs_loose_intersection OUT_CUDA_ARCHS SRC_CUDA_ARCHS TGT_CUDA_ARCHS)
   list(REMOVE_DUPLICATES _PTX_ARCHS)
   list(REMOVE_DUPLICATES _SRC_CUDA_ARCHS)
 
-  # if x.0a is in SRC_CUDA_ARCHS and x.0 is in CUDA_ARCHS then we should
-  # remove x.0a from SRC_CUDA_ARCHS and add x.0a to _CUDA_ARCHS
+  # If x.0a or x.0f is in SRC_CUDA_ARCHS and x.0 is in CUDA_ARCHS then we should
+  # remove x.0a or x.0f from SRC_CUDA_ARCHS and add x.0a or x.0f to _CUDA_ARCHS
   set(_CUDA_ARCHS)
   foreach(_arch ${_SRC_CUDA_ARCHS})
-    if(_arch MATCHES "\\a$")
+    if(_arch MATCHES "[af]$")
       list(REMOVE_ITEM _SRC_CUDA_ARCHS "${_arch}")
-      string(REPLACE "a" "" _base "${_arch}")
+      string(REGEX REPLACE "[af]$" "" _base "${_arch}")
       if ("${_base}" IN_LIST TGT_CUDA_ARCHS)
         list(REMOVE_ITEM _TGT_CUDA_ARCHS "${_base}")
         list(APPEND _CUDA_ARCHS "${_arch}")
@@ -480,7 +480,6 @@ function (define_gpu_extension_target GPU_MOD_NAME)
       ${GPU_LANGUAGE}_ARCHITECTURES "${GPU_ARCHITECTURES}")
   endif()
 
-  set_property(TARGET ${GPU_MOD_NAME} PROPERTY CXX_STANDARD 17)
 
   target_compile_options(${GPU_MOD_NAME} PRIVATE
     $<$<COMPILE_LANGUAGE:${GPU_LANGUAGE}>:${GPU_COMPILE_FLAGS}>)
@@ -28,10 +28,10 @@
 
 #ifdef USE_ROCM
   #include <hip/hip_bf16.h>
-  #include "../quantization/fp8/amd/quant_utils.cuh"
+  #include "../quantization/w8a8/fp8/amd/quant_utils.cuh"
 typedef __hip_bfloat16 __nv_bfloat16;
 #else
-  #include "../quantization/fp8/nvidia/quant_utils.cuh"
+  #include "../quantization/w8a8/fp8/nvidia/quant_utils.cuh"
 #endif
 
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
@@ -1,38 +0,0 @@
-/*
- * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <torch/all.h>
-
-#if defined ENABLE_CUTLASS_MLA && ENABLE_CUTLASS_MLA
-void cutlass_mla_decode_sm100a(torch::Tensor const& out,
-                               torch::Tensor const& q_nope,
-                               torch::Tensor const& q_pe,
-                               torch::Tensor const& kv_c_and_k_pe_cache,
-                               torch::Tensor const& seq_lens,
-                               torch::Tensor const& page_table, double scale);
-#endif
-
-void cutlass_mla_decode(torch::Tensor const& out, torch::Tensor const& q_nope,
-                        torch::Tensor const& q_pe,
-                        torch::Tensor const& kv_c_and_k_pe_cache,
-                        torch::Tensor const& seq_lens,
-                        torch::Tensor const& page_table, double scale) {
-#if defined ENABLE_CUTLASS_MLA && ENABLE_CUTLASS_MLA
-  return cutlass_mla_decode_sm100a(out, q_nope, q_pe, kv_c_and_k_pe_cache,
-                                   seq_lens, page_table, scale);
-#endif
-  TORCH_CHECK_NOT_IMPLEMENTED(false, "No compiled cutlass MLA");
-}
@@ -1,225 +0,0 @@
-/*
- * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <torch/all.h>
-
-#include <ATen/cuda/CUDAContext.h>
-#include <c10/cuda/CUDAGuard.h>
-
-#include "cute/tensor.hpp"
-
-#include "cutlass/cutlass.h"
-#include "cutlass/kernel_hardware_info.h"
-
-#include "cutlass_extensions/common.hpp"
-
-#include "device/sm100_mla.hpp"
-#include "kernel/sm100_mla_tile_scheduler.hpp"
-
-using namespace cute;
-using namespace cutlass::fmha::kernel;
-
-template <typename T, bool PersistenceOption = true>
-struct MlaSm100 {
-  using Element = T;
-  using ElementAcc = float;
-  using ElementOut = T;
-
-  using TileShape = Shape<_128, _128, Shape<_512, _64>>;
-  using TileShapeH = cute::tuple_element_t<0, TileShape>;
-  using TileShapeD = cute::tuple_element_t<2, TileShape>;
-
-  // H K (D_latent D_rope) B
-  using ProblemShape = cute::tuple<TileShapeH, int, TileShapeD, int>;
-
-  using StrideQ = cute::tuple<int64_t, _1, int64_t>;  // H D B
-  using StrideK = cute::tuple<int64_t, _1, int64_t>;  // K D B
-  using StrideO = StrideK;                            // H D B
-  using StrideLSE = cute::tuple<_1, int>;             // H B
-
-  using TileScheduler =
-      std::conditional_t<PersistenceOption, Sm100MlaPersistentTileScheduler,
-                         Sm100MlaIndividualTileScheduler>;
-
-  using FmhaKernel =
-      cutlass::fmha::kernel::Sm100FmhaMlaKernelTmaWarpspecialized<
-          TileShape, Element, ElementAcc, ElementOut, ElementAcc, TileScheduler,
-          /*kIsCpAsync=*/true>;
-  using Fmha = cutlass::fmha::device::MLA<FmhaKernel>;
-};
-
-template <typename T>
-typename T::Fmha::Arguments args_from_options(
-    at::Tensor const& out, at::Tensor const& q_nope, at::Tensor const& q_pe,
-    at::Tensor const& kv_c_and_k_pe_cache, at::Tensor const& seq_lens,
-    at::Tensor const& page_table, double scale) {
-  cutlass::KernelHardwareInfo hw_info;
-  hw_info.device_id = q_nope.device().index();
-  hw_info.sm_count =
-      cutlass::KernelHardwareInfo::query_device_multiprocessor_count(
-          hw_info.device_id);
-
-  int batches = q_nope.sizes()[0];
-  int page_count_per_seq = page_table.sizes()[1];
-  int page_count_total = kv_c_and_k_pe_cache.sizes()[0];
-  int page_size = kv_c_and_k_pe_cache.sizes()[1];
-  int max_seq_len = page_size * page_count_per_seq;
-  using TileShapeH = typename T::TileShapeH;
-  using TileShapeD = typename T::TileShapeD;
-  auto problem_shape =
-      cute::make_tuple(TileShapeH{}, max_seq_len, TileShapeD{}, batches);
-
-  auto [H, K, D, B] = problem_shape;
-  auto [D_latent, D_rope] = D;
-
-  using StrideQ = typename T::StrideQ;
-  using StrideK = typename T::StrideK;
-  using StrideO = typename T::StrideO;
-  using StrideLSE = typename T::StrideLSE;
-
-  StrideQ stride_Q_latent = cute::make_tuple(
-      static_cast<int64_t>(D_latent), _1{}, static_cast<int64_t>(H * D_latent));
-  StrideQ stride_Q_rope = cute::make_tuple(static_cast<int64_t>(D_rope), _1{},
-                                           static_cast<int64_t>(H * D_rope));
-  StrideK stride_C =
-      cute::make_tuple(static_cast<int64_t>(D_latent + D_rope), _1{},
-                       static_cast<int64_t>(page_size * (D_latent + D_rope)));
-  StrideLSE stride_PT = cute::make_stride(_1{}, page_count_per_seq);
-  StrideLSE stride_LSE = cute::make_tuple(_1{}, static_cast<int>(H));
-  StrideO stride_O = cute::make_tuple(static_cast<int64_t>(D_latent), _1{},
-                                      static_cast<int64_t>(H * D_latent));
-
-  using Element = typename T::Element;
-  using ElementOut = typename T::ElementOut;
-  using ElementAcc = typename T::ElementAcc;
-  auto Q_latent_ptr = static_cast<Element*>(q_nope.data_ptr());
-  auto Q_rope_ptr = static_cast<Element*>(q_pe.data_ptr());
-  auto C_ptr = static_cast<Element*>(kv_c_and_k_pe_cache.data_ptr());
-  auto scale_f = static_cast<float>(scale);
-  typename T::Fmha::Arguments arguments{
-      problem_shape,
-      {scale_f, Q_latent_ptr, stride_Q_latent, Q_rope_ptr, stride_Q_rope, C_ptr,
-       stride_C, C_ptr + D_latent, stride_C,
-       static_cast<int*>(seq_lens.data_ptr()),
-       static_cast<int*>(page_table.data_ptr()), stride_PT, page_count_total,
-       page_size},
-      {static_cast<ElementOut*>(out.data_ptr()), stride_O,
-       static_cast<ElementAcc*>(nullptr), stride_LSE},
-      hw_info,
-      1,        // split_kv
-      nullptr,  // is_var_split_kv
-  };
-  // TODO(kaixih@nvidia): When split_kv=-1 and is_var_split_kv=false, we compute
-  // split_kv automatically based on batch size and sequence length to balance
-  // workload across available SMs. Consider using var_split_kv for manual
-  // control if needed.
-  T::Fmha::set_split_kv(arguments);
-  return arguments;
-}
-
-template <typename Element>
-void runMla(at::Tensor const& out, at::Tensor const& q_nope,
-            at::Tensor const& q_pe, at::Tensor const& kv_c_and_k_pe_cache,
-            at::Tensor const& seq_lens, at::Tensor const& page_table,
-            float scale, cudaStream_t stream) {
-  using MlaSm100Type = MlaSm100<Element>;
-  typename MlaSm100Type::Fmha fmha;
-  auto arguments = args_from_options<MlaSm100Type>(
-      out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens, page_table, scale);
-  size_t workspace_size = MlaSm100Type::Fmha::get_workspace_size(arguments);
-  auto const workspace_options =
-      torch::TensorOptions().dtype(torch::kUInt8).device(q_nope.device());
-  auto workspace = torch::empty(workspace_size, workspace_options);
-
-  CUTLASS_CHECK(fmha.can_implement(arguments));
-
-  CUTLASS_CHECK(fmha.initialize(arguments, workspace.data_ptr(), stream));
-
-  CUTLASS_CHECK(fmha.run(arguments, workspace.data_ptr(), stream));
-}
-
-void cutlass_mla_decode_sm100a(torch::Tensor const& out,
-                               torch::Tensor const& q_nope,
-                               torch::Tensor const& q_pe,
-                               torch::Tensor const& kv_c_and_k_pe_cache,
-                               torch::Tensor const& seq_lens,
-                               torch::Tensor const& page_table, double scale) {
-  TORCH_CHECK(q_nope.device().is_cuda(), "q_nope must be on CUDA");
-  TORCH_CHECK(q_nope.dim() == 3, "q_nope must be a 3D tensor");
-  TORCH_CHECK(q_pe.dim() == 3, "q_pe must be a 3D tensor");
-  TORCH_CHECK(kv_c_and_k_pe_cache.dim() == 3,
-              "kv_c_and_k_pe_cache must be a 3D tensor");
-  TORCH_CHECK(seq_lens.dim() == 1, "seq_lens must be a 1D tensor");
-  TORCH_CHECK(page_table.dim() == 2, "page_table must be a 2D tensor");
-  TORCH_CHECK(out.dim() == 3, "out must be a 3D tensor");
-
-  auto B_q_nope = q_nope.size(0);
-  auto H_q_nope = q_nope.size(1);
-  auto D_q_nope = q_nope.size(2);
-  auto B_q_pe = q_pe.size(0);
-  auto H_q_pe = q_pe.size(1);
-  auto D_q_pe = q_pe.size(2);
-  auto B_pt = page_table.size(0);
-  auto PAGE_NUM = page_table.size(1);
-  auto PAGE_SIZE = kv_c_and_k_pe_cache.size(1);
-  auto D_ckv = kv_c_and_k_pe_cache.size(2);
-  auto B_o = out.size(0);
-  auto H_o = out.size(1);
-  auto D_o = out.size(2);
-
-  TORCH_CHECK(D_q_nope == 512, "D_q_nope must be equal to 512");
-  TORCH_CHECK(D_q_pe == 64, "D_q_pe must be equal to 64");
-  TORCH_CHECK(D_ckv == 576, "D_ckv must be equal to 576");
-  TORCH_CHECK(H_q_nope == H_q_pe && H_q_nope == H_o && H_o == 128,
-              "H_q_nope, H_q_pe, and H_o must be equal to 128");
-  TORCH_CHECK(PAGE_SIZE > 0 && (PAGE_SIZE & (PAGE_SIZE - 1)) == 0,
-              "PAGE_SIZE must be a power of 2");
-  TORCH_CHECK(
-      B_q_nope == B_q_pe && B_q_nope == B_pt && B_q_nope == B_o,
-      "Batch dims must be same for page_table, q_nope and q_pe, and out");
-  TORCH_CHECK(PAGE_NUM % (128 / PAGE_SIZE) == 0,
-              "PAGE_NUM must be divisible by 128 / PAGE_SIZE");
-  TORCH_CHECK(D_o == 512, "D_o must be equal to 512");
-
-  TORCH_CHECK(q_nope.dtype() == at::ScalarType::Half ||
-                  q_nope.dtype() == at::ScalarType::BFloat16 ||
-                  q_nope.dtype() == at::ScalarType::Float8_e4m3fn,
-              "q_nope must be a half, bfloat16, or float8_e4m3fn tensor");
-  TORCH_CHECK(kv_c_and_k_pe_cache.dtype() == q_nope.dtype() &&
-                  q_nope.dtype() == q_pe.dtype(),
-              "kv_c_and_k_pe_cache, q_nope, and q_pe must be the same type");
-  TORCH_CHECK(seq_lens.dtype() == torch::kInt32,
-              "seq_lens must be a 32-bit integer tensor");
-  TORCH_CHECK(page_table.dtype() == torch::kInt32,
-              "page_table must be a 32-bit integer tensor");
-
-  auto in_dtype = q_nope.dtype();
-  const at::cuda::OptionalCUDAGuard device_guard(device_of(q_nope));
-  const cudaStream_t stream =
-      at::cuda::getCurrentCUDAStream(q_nope.get_device());
-  if (in_dtype == at::ScalarType::Half) {
-    runMla<cutlass::half_t>(out, q_nope, q_pe, kv_c_and_k_pe_cache, seq_lens,
-                            page_table, scale, stream);
-  } else if (in_dtype == at::ScalarType::BFloat16) {
-    runMla<cutlass::bfloat16_t>(out, q_nope, q_pe, kv_c_and_k_pe_cache,
-                                seq_lens, page_table, scale, stream);
-  } else if (in_dtype == at::ScalarType::Float8_e4m3fn) {
-    runMla<cutlass::float_e4m3_t>(out, q_nope, q_pe, kv_c_and_k_pe_cache,
-                                  seq_lens, page_table, scale, stream);
-  } else {
-    TORCH_CHECK(false, "Unsupported input data type of MLA");
-  }
-}
@@ -133,6 +133,14 @@ public:
     // printf(" sm_count = %d\n", sm_count);
     int max_splits = ceil_div(K, 128);
     max_splits = min(16, max_splits);
+
+    // TODO: This avoids a hang when the batch size larger than 1 and
+    // there is more than 1 kv_splits.
+    // Discuss with NVIDIA how this can be fixed.
+    if (B > 1) {
+      max_splits = min(1, max_splits);
+    }
+
     // printf(" max_splits = %d\n", max_splits);
     int sms_per_batch = max(1, sm_count / B);
     // printf(" sms_per_batch = %d\n", sms_per_batch);
Some files were not shown because too many files have changed in this diff.