Mirror of https://github.com/vllm-project/vllm.git (synced 2025-10-28 20:34:35 +08:00)

Compare commits: benchmark-...gpu-ids (1585 commits)
| SHA1 | Author | Date | |
|---|---|---|---|
| 37cf1f27f2 | |||
| 45ea3c31a2 | |||
| df866cfebf | |||
| 29596317b0 | |||
| 7b86860ff5 | |||
| f386a9e56c | |||
| efe73d0575 | |||
| 853487bc1b | |||
| 9ff2af6d2b | |||
| 70ca5484f5 | |||
| 5358cce5ff | |||
| 2155e95ef1 | |||
| f95570a52d | |||
| b6e7e3d58f | |||
| e760fcef22 | |||
| 6bbf1795b7 | |||
| 9e0ef888f0 | |||
| 97abeb1daa | |||
| 34dad19e7b | |||
| 6db31e7a27 | |||
| 977180c912 | |||
| c40784c794 | |||
| baed180aa0 | |||
| 0b407479ef | |||
| 5eaf570050 | |||
| d8ee5a2ca4 | |||
| b9fca83256 | |||
| 32dffc2772 | |||
| c438183e99 | |||
| baba0389f7 | |||
| c6c22f16d3 | |||
| dd382e0fe3 | |||
| 849590a2a7 | |||
| a4c23314c0 | |||
| b942c094e3 | |||
| b4bab81660 | |||
| b91cb3fa5c | |||
| 71d1d75b7a | |||
| 72d14d0eed | |||
| e34d130c16 | |||
| 7721ef1786 | |||
| 8369b7c2a9 | |||
| 3eb4ad53f3 | |||
| 90a2769f20 | |||
| e60d422f19 | |||
| 0d914c81a2 | |||
| 6e428cdd7a | |||
| 93b9d9f499 | |||
| af107d5a0e | |||
| 31c5d0a1b7 | |||
| afb7cff1b9 | |||
| d2e841a10a | |||
| 14601f5fba | |||
| 042d131f39 | |||
| 8e807cdfa4 | |||
| e601efcb10 | |||
| 22dd9c2730 | |||
| a6d795d593 | |||
| a37d75bbec | |||
| edd270bc78 | |||
| 110df74332 | |||
| 1ad69e8375 | |||
| b8a498c9b2 | |||
| 923147b5e8 | |||
| 45877ef740 | |||
| 6e4bef1bea | |||
| 4ff79a136e | |||
| 448acad31e | |||
| eb0b2d2f08 | |||
| 3112271f6e | |||
| 1fd471e957 | |||
| 2c5ebec064 | |||
| 2e610deb72 | |||
| 6e2c19ce22 | |||
| 47db8c2c15 | |||
| 462b269280 | |||
| c18b3b8e8b | |||
| 9528e3a05e | |||
| 9fb52e523a | |||
| e202dd2736 | |||
| 43813e6361 | |||
| cede942b87 | |||
| fe1e924811 | |||
| 4548c03c50 | |||
| 40b86aa05e | |||
| 432870829d | |||
| f73d02aadc | |||
| c5ebe040ac | |||
| 8d763cb891 | |||
| cf4cd53982 | |||
| 32c9be2200 | |||
| 8aeaa910a2 | |||
| 906e05d840 | |||
| ef9a2990ae | |||
| 7e90870491 | |||
| d3f05c9248 | |||
| c108781c85 | |||
| 3d184b95b8 | |||
| 2f35a022e6 | |||
| ffe00ef77a | |||
| 5561681d04 | |||
| fbd62d8750 | |||
| 2e26f9156a | |||
| 9e5452ee34 | |||
| 0e3fe896e2 | |||
| 1caca5a589 | |||
| 783921d889 | |||
| 4a98edff1f | |||
| a7bab0c9e5 | |||
| 25950dca9b | |||
| a4113b035c | |||
| 7e1665b089 | |||
| 8d1096e7db | |||
| 8d775dd30a | |||
| 78fe77534b | |||
| 2f2fcb31b8 | |||
| 1dba2c4ebe | |||
| 71d6de3a26 | |||
| 536fd33003 | |||
| 619b9f5c7e | |||
| d1b689c445 | |||
| 9854dc9040 | |||
| ff5c60fad8 | |||
| 6f1229f91d | |||
| 1819fbda63 | |||
| 7f0367109e | |||
| fb14d53cf6 | |||
| b024a42e93 | |||
| cb97f2bfc5 | |||
| 359200f6ac | |||
| 220aee902a | |||
| 67d25eca05 | |||
| 363528de27 | |||
| 4ff61ababa | |||
| 0ec3779df7 | |||
| b616f6a53d | |||
| 2e25bb12a8 | |||
| 9965c47d0d | |||
| 059d4cdb49 | |||
| bdb84e26b0 | |||
| 3dd359147d | |||
| 657f2f301a | |||
| a1aafc827a | |||
| 139508a418 | |||
| d265414dbc | |||
| 48fb076cbc | |||
| c1909e7e8c | |||
| b95877509b | |||
| 706ff13224 | |||
| ccbfb1d1c9 | |||
| 9e5552aa13 | |||
| 0c600b9ab6 | |||
| e303dcf523 | |||
| ae9c4d416f | |||
| d853520b3e | |||
| ba51aea65e | |||
| 8452946c06 | |||
| 2e7cbf2d7d | |||
| 7da296be04 | |||
| b205e8467d | |||
| be0cfb2b68 | |||
| 1a03dd496b | |||
| 27b8017636 | |||
| 9ec1e3065a | |||
| 9dae7d46bf | |||
| 7058d7dd5d | |||
| a0389e0554 | |||
| 3be8d312a2 | |||
| 3abfe22154 | |||
| e81fbefe8a | |||
| 9290de5667 | |||
| 7f280d69c9 | |||
| 02cabff207 | |||
| 3d19d47d91 | |||
| 8acb4badee | |||
| 314af8617c | |||
| 0e96cc9b7e | |||
| ecad851cbd | |||
| ed70f3c64f | |||
| 650d5dbd04 | |||
| 9025a9a705 | |||
| c05596f1a3 | |||
| 787b13389e | |||
| 96453cfa83 | |||
| b1c1fe35a5 | |||
| 08d81f1014 | |||
| 6cc1e7d96d | |||
| 9909726d2a | |||
| 22e9d42040 | |||
| 86debab54c | |||
| be250bbc67 | |||
| 27949354fa | |||
| bd5038af07 | |||
| a2f14dc8f9 | |||
| 92ee7baaf9 | |||
| 7151f92241 | |||
| e28533a16f | |||
| 6d42ce8315 | |||
| ded1fb635b | |||
| 97d9524fe9 | |||
| d8cf819a9a | |||
| 551ef1631a | |||
| 2863befce3 | |||
| 2965c99c86 | |||
| 2062c0723d | |||
| 1c50e100a9 | |||
| 3ee56e26be | |||
| 8fe7fc8634 | |||
| e936e401de | |||
| f5dfa07531 | |||
| 022c58b80f | |||
| 19108ef311 | |||
| 5a52f389dd | |||
| 65b1cbb138 | |||
| 6c9837a761 | |||
| 6f2f53a82d | |||
| 7b1895e6ce | |||
| 4d36693687 | |||
| daec9dea6e | |||
| daceac57c7 | |||
| 8615d9776f | |||
| 7b460c25f9 | |||
| f719772281 | |||
| d45417b804 | |||
| a29e62ea34 | |||
| e53be6f00a | |||
| c329ceca6d | |||
| 3c545c0c3b | |||
| e8c3bd2cd1 | |||
| c6c983053d | |||
| aafabaa0d5 | |||
| 94a55c7681 | |||
| aa0dc77ef5 | |||
| 4ab3ac285e | |||
| d1c956dc0f | |||
| dec197e3e5 | |||
| 6e244ae091 | |||
| cd4cfee689 | |||
| e110930680 | |||
| 8b64c895c0 | |||
| 0740e29b66 | |||
| 44d2e6af63 | |||
| 2d7779f888 | |||
| a57d57fa72 | |||
| 71799fd005 | |||
| e9fd658a73 | |||
| 07b8fae219 | |||
| 562308816c | |||
| 04e1642e32 | |||
| b69781f107 | |||
| 0bceac9810 | |||
| 34878a0b48 | |||
| 6393b03986 | |||
| 0907d507bf | |||
| c894c5dc1f | |||
| 1f5d178e9c | |||
| 27c065df50 | |||
| 84c260caeb | |||
| 167aca45cb | |||
| 0567c8249f | |||
| d188913d99 | |||
| 1d7c29f5fe | |||
| 65397e40f5 | |||
| 9502c38138 | |||
| 2582683566 | |||
| 754b00edb3 | |||
| 296ce95d8e | |||
| 2d7620c3eb | |||
| 55c65ab495 | |||
| 2cc2069970 | |||
| 9f0608fc16 | |||
| 4e0db57fff | |||
| c40692bf9a | |||
| 4734704b30 | |||
| 8b8c209e35 | |||
| 23a04e0895 | |||
| 02c97d9a92 | |||
| e795d723ed | |||
| 8359f4c8d8 | |||
| bf5181583f | |||
| c53fec1fcb | |||
| 0f9e7354f5 | |||
| ba7ba35cda | |||
| 015fab8c2f | |||
| f59fc60fb3 | |||
| 879f69bed3 | |||
| 7108934142 | |||
| 3443aaf8dd | |||
| 2273ec322c | |||
| a6c4b87fbc | |||
| 1afa9948f5 | |||
| 0d06b533a0 | |||
| c01d1c5aba | |||
| ead369845d | |||
| c6e3bba8e6 | |||
| 91f7d9d0b6 | |||
| 8619e7158c | |||
| c635c5f744 | |||
| a045b7e89a | |||
| 981eeca41a | |||
| 26d34eb67e | |||
| 53da4cd397 | |||
| 9a3b88328f | |||
| 3014c920da | |||
| 0eed516951 | |||
| ee5ad8d2c5 | |||
| a738dbb2a1 | |||
| 33d5e29be9 | |||
| 4671ac6e2a | |||
| dd2ccf8dde | |||
| a3bc76e4b5 | |||
| e6327c9b3e | |||
| d0132f025d | |||
| 61f4fc5dc6 | |||
| 68aaeb3749 | |||
| c3649e4fee | |||
| 53243e5c42 | |||
| a6e6604d32 | |||
| b82e0f82cb | |||
| 5111642a6f | |||
| 1bcd15edc7 | |||
| 2ebff5b77c | |||
| f17aec0d63 | |||
| 493c275352 | |||
| f39ab2d4bd | |||
| 4a0f7888a3 | |||
| c4cf260677 | |||
| 33d51f599e | |||
| e91386cde1 | |||
| 2c11a29f0b | |||
| c76a506bd6 | |||
| ec0db6f51c | |||
| c305a2109d | |||
| 202c5df935 | |||
| 2bb246b8f7 | |||
| 4c409cabc2 | |||
| 3b1e4c6a23 | |||
| 2c5302fadd | |||
| caa680fd2e | |||
| c3bf9bad11 | |||
| 6f170f11dd | |||
| 8ca81bb069 | |||
| e773a9e1c2 | |||
| 71baf85ae1 | |||
| 79f2f1c2a1 | |||
| 2e3e3c86dc | |||
| 7e8977fcd4 | |||
| f1e840e842 | |||
| 7771d1de88 | |||
| 71d1219545 | |||
| e384f2f108 | |||
| 089a306f19 | |||
| 5e666f72cd | |||
| e3a3e4db46 | |||
| e41bf15cd0 | |||
| 5aa4a015ce | |||
| b6bad3d186 | |||
| ee9a1531aa | |||
| 10d82f9ac5 | |||
| ea10dd9d9e | |||
| ead2110297 | |||
| 01220ce89a | |||
| 6f68c49220 | |||
| 4719460644 | |||
| 466166dcfd | |||
| 1d0ae26c85 | |||
| 6021999573 | |||
| c7b370c603 | |||
| aa20d10a91 | |||
| 2de12be428 | |||
| 83ca9ae47b | |||
| e2148dc5ea | |||
| b1098b4072 | |||
| 799397ee4f | |||
| 4959915089 | |||
| 8d1e89d946 | |||
| 36239f79dd | |||
| dfada85eee | |||
| ed33349738 | |||
| d49adea1f9 | |||
| 14fdd21d39 | |||
| 04fefe7c9a | |||
| 3b523e38d9 | |||
| 16c16301c8 | |||
| 9206d0ff01 | |||
| a89209b78d | |||
| ffacb222cb | |||
| 12575cfa7a | |||
| 8b6e1d639c | |||
| 735a9de71f | |||
| 257ab95439 | |||
| cca91a7a10 | |||
| f04d604567 | |||
| 19a53b2783 | |||
| eccdc8318c | |||
| 5f52a84685 | |||
| d4629dc43f | |||
| 6e9cc73f67 | |||
| c53711bd63 | |||
| dac8cc49f4 | |||
| a44b1c951d | |||
| b447624ee3 | |||
| cda92307c1 | |||
| bf57ccc5c2 | |||
| ffb2cd6b54 | |||
| ca94d7fa00 | |||
| 5a1c2e15d8 | |||
| 4c8f64faa7 | |||
| 93aee29fdb | |||
| 154d063b9f | |||
| ccd7c05089 | |||
| c48c6c4008 | |||
| aed8468642 | |||
| 5c76b9cdaf | |||
| ddfed314f9 | |||
| 5b3ad5ecf2 | |||
| ede5c4ebdf | |||
| 07334959d8 | |||
| 119f683949 | |||
| 0860087aff | |||
| 6bc7b57315 | |||
| 90f9c2eb5c | |||
| 387bdf0ab9 | |||
| 5e5baa91aa | |||
| 836d4ce140 | |||
| c3fec47bb7 | |||
| 1173804dca | |||
| 4d5424029b | |||
| 3e7506975c | |||
| ee35e96ac3 | |||
| dec66d253b | |||
| 8d120701fd | |||
| f40f763f12 | |||
| 26bc46ef89 | |||
| a77aea59fd | |||
| b692e9cd07 | |||
| 367871a469 | |||
| 92183b41f3 | |||
| c6703d1e0d | |||
| a5e7242d5f | |||
| 91b2c17a55 | |||
| 055915e6ce | |||
| 3d330c4c09 | |||
| 0b73736a0d | |||
| ee1531bc38 | |||
| e13945f9dd | |||
| 08500011d3 | |||
| 861a0a0a39 | |||
| bc956b38d0 | |||
| 294fc1e2c9 | |||
| 2db9044ab6 | |||
| 6fa718a460 | |||
| 06be858828 | |||
| d1e34cc9ac | |||
| bd517eb9fe | |||
| d65668b4e8 | |||
| aafbbd981f | |||
| 0f0874515a | |||
| 3597b06a4f | |||
| 1015296b79 | |||
| ce9dc02c93 | |||
| a24cb91600 | |||
| 7e8d97dd3f | |||
| d70bc7c029 | |||
| ce688ad46e | |||
| cefdb9962d | |||
| ace5cdaff0 | |||
| 6458721108 | |||
| bb4a0decef | |||
| c707cfc12e | |||
| 7b3c9ff91d | |||
| c68698b326 | |||
| e3b12667d4 | |||
| e6aab5de29 | |||
| c57bb199b3 | |||
| dba68f9159 | |||
| a3319f4f04 | |||
| 9d880f594d | |||
| 017ef648e9 | |||
| 4b25ab14e2 | |||
| f98548b9da | |||
| 96846bb360 | |||
| b6efafd9e4 | |||
| 1129e2b1ab | |||
| c742438f8b | |||
| 73e2e0118f | |||
| c9280e6346 | |||
| af09b3f0a0 | |||
| 4f6c42fa0a | |||
| dff680001d | |||
| 2e090bd5df | |||
| 1b0b065eb5 | |||
| d5bdf899e4 | |||
| 7e3e74c97c | |||
| 3f6341bf7f | |||
| e5d35d62f5 | |||
| 2f1c19b245 | |||
| 42f52cc95b | |||
| 97a9465bbc | |||
| c7ea0b56cd | |||
| 29fa5cac1c | |||
| b2d9be6f7d | |||
| 04a55612dd | |||
| 89b0f84e17 | |||
| 497a91e9f7 | |||
| 943ffa5703 | |||
| 5c8d34a42c | |||
| 3c8694eabe | |||
| 7484e1fce2 | |||
| a2142f0196 | |||
| 871d6b7c74 | |||
| 29a38f0352 | |||
| a5115f4ff5 | |||
| 68b4a26149 | |||
| b8e809a057 | |||
| 5039ec2336 | |||
| 7c644ab6d5 | |||
| 2d40665fe8 | |||
| 96ada386b7 | |||
| 1e473b3010 | |||
| 2b1e2111b0 | |||
| a45b979d9f | |||
| 3952731e8f | |||
| 77f0d465d0 | |||
| 22c3c0aa4a | |||
| 33f8dba7c6 | |||
| 5241ca50d6 | |||
| da9b523ce1 | |||
| b6553be1bc | |||
| 64a9af5afa | |||
| e4248849ec | |||
| 467bef18a3 | |||
| 5f1ac1e1d1 | |||
| 9368cc90b2 | |||
| 32b3946bb4 | |||
| 6b1391ca7e | |||
| a3f66e75d1 | |||
| 319cb1e351 | |||
| 1efef71645 | |||
| 646d62f636 | |||
| 6cd4ae8acd | |||
| c016047ed7 | |||
| 9af6d22e4c | |||
| 4589b94032 | |||
| cc867be19c | |||
| 3a7cd627a8 | |||
| 8058c91108 | |||
| 7d44c469fe | |||
| 31f58be96a | |||
| ebb2f383b8 | |||
| c1c7dbbeeb | |||
| 5cf2daea9a | |||
| b8089195b4 | |||
| 770e5dcdb8 | |||
| c57c9415b1 | |||
| 01810f9236 | |||
| 59abbd84f9 | |||
| 95a6568b5c | |||
| 0eca5eacd0 | |||
| 12e5829221 | |||
| 3a4d417707 | |||
| 8335667c22 | |||
| e1c4380d4c | |||
| e31ae3de36 | |||
| 2ffb9b6e07 | |||
| cda10fa3e2 | |||
| c123bc33f9 | |||
| b9a1791e2c | |||
| 989dcee981 | |||
| 3d64d366e0 | |||
| eaa2e51088 | |||
| d77f7fb871 | |||
| 2d8476e465 | |||
| 88be823d57 | |||
| 4e4f63ad45 | |||
| d2f0e7e615 | |||
| 122cdca5f6 | |||
| cf02f9b283 | |||
| c4296b1a27 | |||
| 66c508b137 | |||
| 84166fee97 | |||
| 6e0cd10f72 | |||
| e010688f50 | |||
| 441b65d8c7 | |||
| 46ecc57973 | |||
| b6a3a9f76d | |||
| ca27f0f9c1 | |||
| aad30bd306 | |||
| 94ecee6282 | |||
| 8267f9916f | |||
| 7353492a47 | |||
| 7661e92ef8 | |||
| f168b85725 | |||
| da511d54d8 | |||
| 65c69444b1 | |||
| 94870359cd | |||
| 0d49483ea9 | |||
| 90b78ec5f9 | |||
| 91a2ef98ea | |||
| 3da2313d78 | |||
| b61dc5f972 | |||
| f8a1a2d108 | |||
| 3465b87ef8 | |||
| c8134bea15 | |||
| cb6d572e85 | |||
| 87360308b7 | |||
| aa49f14832 | |||
| 9ef9173cfa | |||
| 85e2b7bb13 | |||
| 61059bee40 | |||
| ec89524f50 | |||
| f20f9f063b | |||
| 9bc8bb07cf | |||
| 1aeb925f34 | |||
| 188a4590d8 | |||
| 18093084be | |||
| da40380214 | |||
| 8fc57501d3 | |||
| af7fc84fd2 | |||
| 0678b52251 | |||
| 25b918eee6 | |||
| a408820f2f | |||
| c56ed8bb0e | |||
| 78dcf56cb3 | |||
| b2fac67130 | |||
| 23027e2daf | |||
| c3fd4d669a | |||
| ef3f98b59f | |||
| 7ee2590478 | |||
| 53a5a0ce30 | |||
| d459fae0a2 | |||
| c8dcc15921 | |||
| 8f4ffbd373 | |||
| 5f2cd251d2 | |||
| 02658c2dfe | |||
| 01dc9a76db | |||
| 35cf32df30 | |||
| 8711bc5e68 | |||
| 2669a0d7b5 | |||
| 8e972d9c44 | |||
| 3336c8cfbe | |||
| b124e1085b | |||
| 41aa578428 | |||
| 8d646c2e53 | |||
| 5d6d1adf15 | |||
| 1409ef9134 | |||
| 4555143ea7 | |||
| 52dceb172d | |||
| abd7df2fca | |||
| b712be98c7 | |||
| a8da78eac9 | |||
| 5d96533e22 | |||
| 4de790fcad | |||
| b5fd9506c1 | |||
| 135cf55cd1 | |||
| 6cac54f4d1 | |||
| 6865fe0074 | |||
| e31446b6c8 | |||
| bdf13965ab | |||
| fa98d77773 | |||
| 01eee40536 | |||
| 19bdaf32b1 | |||
| 02f0c7b220 | |||
| d054da1992 | |||
| 4b7817c119 | |||
| d00dd65cd4 | |||
| d81edded69 | |||
| 476844d44c | |||
| 4e68ae5e59 | |||
| 4e88723f32 | |||
| 118ff92111 | |||
| ec2dcd80bc | |||
| 42243fbda0 | |||
| 6d18ed2a2e | |||
| f32fcd9444 | |||
| d32aa2e670 | |||
| cc977286e7 | |||
| 17430e3653 | |||
| 1282bd812e | |||
| bdce64f236 | |||
| 9e6f61e8c3 | |||
| 8655f47f37 | |||
| 4ce42f9204 | |||
| 8a57872b2a | |||
| 5bc1ad6cee | |||
| 9112b443a0 | |||
| c57d577e8d | |||
| ca2f6b9c30 | |||
| 20133cfee2 | |||
| ebb1ec9318 | |||
| 5b168b6d7a | |||
| 9760fd8f6a | |||
| b9f61e1387 | |||
| d6fd3a33b8 | |||
| 432ec9926e | |||
| 2b102d51ad | |||
| aa54a7bf7b | |||
| 2ad6194a02 | |||
| c594cbf565 | |||
| a35ca765a5 | |||
| 6aa8f9a4e7 | |||
| 1bc86a3da1 | |||
| bbfa0c61d1 | |||
| 20079c6e36 | |||
| 9a1b9b99d7 | |||
| 8bf507d766 | |||
| 306d60401d | |||
| f2c3f66d59 | |||
| 0f5e0d567e | |||
| c55d804672 | |||
| 749f5bdd38 | |||
| 2a50ef5760 | |||
| b8b904795d | |||
| ba5111f237 | |||
| 1e123529d7 | |||
| dff80b0e42 | |||
| 7782464a17 | |||
| 0f71e24034 | |||
| 1dab4d5718 | |||
| 7f21e8052b | |||
| 5a8641638a | |||
| f49239cb45 | |||
| 2dbe8c0774 | |||
| 84ec470fca | |||
| b29ca5c4d5 | |||
| ec6833c5e9 | |||
| e1fadf1197 | |||
| 43ff405b90 | |||
| fba02e3bd1 | |||
| 4577fc9abb | |||
| 5f1d0c8118 | |||
| c3bb9f2331 | |||
| 8f8900cee9 | |||
| 6acb7a6285 | |||
| 4f4a6b844a | |||
| 4d0a1541be | |||
| 77b6e74fe2 | |||
| 5acf828d99 | |||
| 3987e2ae96 | |||
| 77164dad5e | |||
| 3de3eadf5b | |||
| 3132290a14 | |||
| 1aa2f81b43 | |||
| d54af615d5 | |||
| a1cc9f33a3 | |||
| a521ef06e5 | |||
| 64eaf5fe05 | |||
| d1d61f3351 | |||
| 32ce3cf7c9 | |||
| d58f9c7f7a | |||
| c29034037d | |||
| 1b7cfd5a36 | |||
| da4b69d0b4 | |||
| c9479b2920 | |||
| 6f2909405e | |||
| b169d5f7b6 | |||
| f8977c233f | |||
| f274581f44 | |||
| 0b1447f890 | |||
| 24d0ef8970 | |||
| 7fcfd954ff | |||
| e740d07f07 | |||
| a652e71dd0 | |||
| 34d6c447c4 | |||
| 972eddf7c9 | |||
| fd7bb88d72 | |||
| 3c49dbdd03 | |||
| 1661a9c28f | |||
| 8e882ffdc0 | |||
| 26b4fa45be | |||
| 515b413ebf | |||
| 269d901734 | |||
| 7951d78738 | |||
| 6dbe5b5c93 | |||
| 643622ba46 | |||
| a09c7ca9f2 | |||
| 0e98964e94 | |||
| c68b5c63eb | |||
| fced756923 | |||
| 321331b8ae | |||
| 6e4cea1cc5 | |||
| 435fa95444 | |||
| 4c2b38ce9e | |||
| d781930f90 | |||
| ce75efeecb | |||
| aa42561e40 | |||
| de65fc8e1e | |||
| 0c492b7824 | |||
| 0f0926b43f | |||
| 7f2c1a87e9 | |||
| b78f844a67 | |||
| 5e13c07d00 | |||
| 774c5fde30 | |||
| 9a21e331ff | |||
| 3e9ce609bd | |||
| 794ae1f551 | |||
| d73a9457a5 | |||
| a3896c7f02 | |||
| 51e98e4ffd | |||
| e56f44d9ec | |||
| e0cbad4e30 | |||
| b48d5cca16 | |||
| 5873877241 | |||
| 696259ca01 | |||
| 6b6d496114 | |||
| aaa4ac1c95 | |||
| 06a0338015 | |||
| 4318c0559d | |||
| a68e293cb9 | |||
| 6881107948 | |||
| e0f0ff87b8 | |||
| c24b1572ac | |||
| 4693a3438c | |||
| bbd9a84dc5 | |||
| a547aeb828 | |||
| fc6d0c290f | |||
| 753944fa9b | |||
| 25a817f202 | |||
| d260f799a9 | |||
| b50602d5f0 | |||
| 1f1b1bc03b | |||
| 1f88dbd2bb | |||
| 0eebd74842 | |||
| 27bebcd897 | |||
| e7523c2e03 | |||
| a869baca73 | |||
| 82e2339b06 | |||
| 9553fdb41e | |||
| 243eb9199f | |||
| 0665e29998 | |||
| e76be06550 | |||
| 0877750029 | |||
| 6d68030f1c | |||
| 5a2c76cbe1 | |||
| 38b13dfe78 | |||
| 61a45e7a72 | |||
| 65523a0995 | |||
| 4b7740a105 | |||
| 4ea62c0ea0 | |||
| 561b77a0d6 | |||
| abd4030d94 | |||
| 8820821b59 | |||
| fba0642704 | |||
| 6071e989df | |||
| 57fd13a707 | |||
| 3a886bd58c | |||
| 35be8fad62 | |||
| f2faac745d | |||
| 279f854519 | |||
| 624b77a2b3 | |||
| 503f8487c2 | |||
| 44073a7ac3 | |||
| 63934543a0 | |||
| 75f81750f3 | |||
| 6ab681bcbe | |||
| cebc22f3b6 | |||
| 6c6dcd8611 | |||
| 7891fdf0c6 | |||
| 6825d9a998 | |||
| b554ab736e | |||
| 9ea7f1abf3 | |||
| 2807271c86 | |||
| b9018a3f9f | |||
| 4ceafb6299 | |||
| 2e6705784f | |||
| 1cb194a018 | |||
| 2cd4d58df4 | |||
| 6d166a8d35 | |||
| ef1dd6870f | |||
| e77dc4bad8 | |||
| 07458a51ce | |||
| c1e4a4052d | |||
| a859320575 | |||
| 441dc63ac7 | |||
| d55e446d13 | |||
| ec82c3e388 | |||
| 45ab403a1f | |||
| 2b10ba7491 | |||
| 4fc1bf813a | |||
| f2036734fb | |||
| 7d9216495c | |||
| 0ddf88e16e | |||
| 1645b60196 | |||
| 2628a69e35 | |||
| 371f7e4ca2 | |||
| 15b45ffb9a | |||
| 273cb3b4d9 | |||
| 8ddd1cf26a | |||
| 6550114c9c | |||
| 9520a989df | |||
| 3d28ad343f | |||
| 6a7988c55b | |||
| 022d8abe29 | |||
| 5221815a00 | |||
| 1068556b2c | |||
| 2cd1fa4556 | |||
| d4c2919760 | |||
| 6220f3c6b0 | |||
| 52fb23f47e | |||
| 6dd51c7ef1 | |||
| 2edb533af2 | |||
| 38a95cb4a8 | |||
| cd821ea5d2 | |||
| 7ab056c273 | |||
| 6526e05111 | |||
| e493e48524 | |||
| 4ce64e2df4 | |||
| fbb13a2c15 | |||
| a1fe24d961 | |||
| d0bc2f810b | |||
| b046cf792d | |||
| 54af915949 | |||
| 71ea614d4a | |||
| 4c611348a7 | |||
| 60cad94b86 | |||
| 9c1baa5bc6 | |||
| 4be2255c81 | |||
| ed5d408255 | |||
| 583507d130 | |||
| e44d8ce8c7 | |||
| 93ecb8139c | |||
| fae453f8ce | |||
| 4b0da7b60e | |||
| c6b636f9fb | |||
| 04eb88dc80 | |||
| 46791e1b4b | |||
| c32e249a23 | |||
| c91fe7b1b9 | |||
| a04720bc36 | |||
| 7b9d832c80 | |||
| 6e588da0f4 | |||
| f8d2cc5f55 | |||
| 721fb9b181 | |||
| 1f3a1200e4 | |||
| 54631f8262 | |||
| cb506ecb5a | |||
| 93f71673ce | |||
| 3f505233fd | |||
| 4e04eceb58 | |||
| 71075029f2 | |||
| ca86a7cf6e | |||
| a35a494745 | |||
| f6037d1907 | |||
| fa72f9a812 | |||
| ebed81fbf5 | |||
| e2d7d31244 | |||
| 23b67b37b2 | |||
| db5a29ba19 | |||
| 51797775c3 | |||
| cf5984b2fe | |||
| d022115cc6 | |||
| acb54ca8e1 | |||
| 6e0fd34d3c | |||
| 176d62e4ea | |||
| 20bd6f4d2e | |||
| 1f079540db | |||
| 94d8ec8d2b | |||
| bb0a311213 | |||
| dd5fa7e04f | |||
| 2b16104557 | |||
| 371376f996 | |||
| c6c10ca920 | |||
| c154d89306 | |||
| eca18691d2 | |||
| 61acfc45bc | |||
| 107f5fc4cb | |||
| 907f935de9 | |||
| 5d7f545204 | |||
| cd8dfc6dfc | |||
| d06dd72ba9 | |||
| ad0012a0ac | |||
| 92247c522e | |||
| 0c15c2e486 | |||
| 3b17ea26e4 | |||
| 23baa2180b | |||
| 980a172474 | |||
| e1f5a71ed7 | |||
| f4a8a37465 | |||
| 8f55962a7f | |||
| be48360c1f | |||
| 86847700d7 | |||
| d6c86d09ae | |||
| 6b35cb10a0 | |||
| 1b1e8e05ff | |||
| bca55b556f | |||
| d981396778 | |||
| 9609327fa4 | |||
| f07a673eb2 | |||
| d565e0976f | |||
| 258bf621d5 | |||
| dc1440cf9f | |||
| 8171221834 | |||
| 7937c2fd52 | |||
| e2ee1e8e9e | |||
| 20d8ce81eb | |||
| 84ab4feb7e | |||
| 6781af5608 | |||
| 1b15df2546 | |||
| 43b5f61dce | |||
| c5bb0ebdc6 | |||
| d637b96099 | |||
| 275c5daeb0 | |||
| 47fda6d089 | |||
| 27d0952600 | |||
| 221cfc2fea | |||
| 9da1095daf | |||
| d1211f8794 | |||
| b6a6e7a529 | |||
| 4fb349f66a | |||
| 908733aca7 | |||
| 1a8f68bb90 | |||
| 9ab2c02ff8 | |||
| 66e63e86ec | |||
| 9214e60631 | |||
| f880d42582 | |||
| dcfe95234c | |||
| 48ac2bed5b | |||
| 3e0d435027 | |||
| 4ee4826ede | |||
| 60017dc841 | |||
| 55f1a468d9 | |||
| fd195b194e | |||
| fabe89bbc4 | |||
| e73b7dfd69 | |||
| 7fdfa01530 | |||
| aef94c6d07 | |||
| 0ceaebf87b | |||
| 1db4f47f81 | |||
| d3d91b6f71 | |||
| 87d871470d | |||
| a5f8c111c2 | |||
| e23564cb70 | |||
| 390ec88905 | |||
| 541817670c | |||
| 67da5720d4 | |||
| 5c04bb8b86 | |||
| 3d2779c29a | |||
| 6b31c84aff | |||
| b18201fe06 | |||
| f4937a51c1 | |||
| ee659e3b60 | |||
| 4e1c6a0264 | |||
| c7852a6d9b | |||
| 8795eb9975 | |||
| 0b34593017 | |||
| e3f3aee6f4 | |||
| 92540529c0 | |||
| fadb8d5c2d | |||
| 2aa5470ac5 | |||
| 51ff154639 | |||
| 566ec04c3d | |||
| 01c22335ba | |||
| 451da4bcbd | |||
| 07ad27121f | |||
| a9944aabfa | |||
| a8f5aec20a | |||
| de71fec81b | |||
| 70f8b96724 | |||
| dd2a94596a | |||
| 420caf7557 | |||
| 4f07a64075 | |||
| e6b8e65d2d | |||
| 26d0419309 | |||
| 83f74c698f | |||
| 2dff093574 | |||
| afe3236e90 | |||
| 65334ef3b9 | |||
| e60f550b38 | |||
| f25e0d1125 | |||
| 09f106a91e | |||
| 2142035b51 | |||
| 78aa341d12 | |||
| 7974736740 | |||
| 2fc9075b82 | |||
| d93c976a0d | |||
| 749f792553 | |||
| 856865008e | |||
| f9c069c85e | |||
| 418d2f8bfb | |||
| 964472b966 | |||
| 59dd311cf5 | |||
| d066e52013 | |||
| c8ea982d9b | |||
| dc372b9c8a | |||
| 9b5b39b650 | |||
| 9ccc6ded42 | |||
| d62a076e84 | |||
| 259127f8b8 | |||
| 612c2edb4f | |||
| 38fe728d60 | |||
| 82e7f9bb03 | |||
| 63dc3426e0 | |||
| 8f5dc41481 | |||
| 63ad622233 | |||
| e7ef61c1f0 | |||
| d4154c35a2 | |||
| 6685890d11 | |||
| 33011318c2 | |||
| 4f8b373225 | |||
| 7b2f28deba | |||
| 2d912fb66f | |||
| 12e6c0b41c | |||
| 9a2a6357de | |||
| 6266c57bae | |||
| 754b699cbe | |||
| 6e27c6d86b | |||
| d5af47a149 | |||
| 65f0f74b66 | |||
| 176a95c670 | |||
| f2ae883b67 | |||
| 40de1ef455 | |||
| 0189a65a2e | |||
| 55aa7af994 | |||
| 0b217da646 | |||
| 19324d660c | |||
| fc407a1425 | |||
| 009d9e7590 | |||
| b922c2ebd2 | |||
| 00b14e0f16 | |||
| 54e467e6f8 | |||
| 79a1d25bbd | |||
| 9944011b30 | |||
| 8c946cecca | |||
| ff334ca1cd | |||
| 6223dd8114 | |||
| 906f0598fc | |||
| cb528d0585 | |||
| 98fcba1575 | |||
| 23b3134eb5 | |||
| ea6ae8cb45 | |||
| 2ff297dce9 | |||
| 8dd0671bac | |||
| f0d610a8ae | |||
| e57e4d6e9e | |||
| ee5be834e7 | |||
| 48545728d8 | |||
| dc1a821768 | |||
| 61e0a506a3 | |||
| 1df491c522 | |||
| d8487ef557 | |||
| c06af9a959 | |||
| 60f7624334 | |||
| f6518b2b48 | |||
| d67085c2c8 | |||
| 307939f299 | |||
| 9d7ea9dbbf | |||
| acee8f48aa | |||
| f065de4e88 | |||
| dc9905368d | |||
| ebab1ac37c | |||
| 2b0db9b0e2 | |||
| 195adb47c0 | |||
| 302f3aca7e | |||
| e9c730c9bd | |||
| 289199feb6 | |||
| b9fd0d7a69 | |||
| 72a3f6b898 | |||
| 98ea35601c | |||
| d19110204c | |||
| 05a4324f8e | |||
| 7ea6cb28b2 | |||
| 9fbf2bfbd5 | |||
| 3a5ea75129 | |||
| 891b9d33de | |||
| 430783018c | |||
| 19a3c78d1f | |||
| ada50aa295 | |||
| 08bf784078 | |||
| d45fe333fb | |||
| 021c16c7ca | |||
| 7de18d541b | |||
| a810b5b088 | |||
| 009b3d5382 | |||
| e4b8713380 | |||
| 06c0922a69 | |||
| cd3edfc908 | |||
| 9cea90eab4 | |||
| d1110f5b5a | |||
| 8132365b74 | |||
| eea22a56ab | |||
| 9112155283 | |||
| 90d0a74b60 | |||
| d74e5f37bc | |||
| ca66a1674c | |||
| 950751a987 | |||
| 4c31218f80 | |||
| 68311891f5 | |||
| fc4441a4ee | |||
| 246e3e0a36 | |||
| 7042cc96b0 | |||
| 0c0fdae84f | |||
| 3b602cdea7 | |||
| 4b2ed7926a | |||
| 7e3571134f | |||
| ea2236bf95 | |||
| 7d4aedae7c | |||
| 22481fbfa3 | |||
| 5c4c08f6f1 | |||
| c44c384b1c | |||
| 85b72cb7b1 | |||
| 6e5595ca39 | |||
| 200da9a517 | |||
| 9f64e93415 | |||
| ec61ea20a8 | |||
| c6798baa9c | |||
| 5b2dcbf0b8 | |||
| 6e4a93e3f7 | |||
| 217db4baa6 | |||
| ff8c400502 | |||
| 89a0315f4c | |||
| 3d1e387652 | |||
| d310e6de98 | |||
| 5e6f939484 | |||
| 760e3ecc8f | |||
| 3c9396a64f | |||
| 376786fac1 | |||
| 4f605a6de5 | |||
| 8342e3abd1 | |||
| a83a0f92b5 | |||
| 226a4272cf | |||
| ec54d73c31 | |||
| a944f8ede7 | |||
| 015815fe01 | |||
| e4ca6e3a99 | |||
| 53d0cb7423 | |||
| f50dcb7c21 | |||
| a1e19b635d | |||
| bb239a730f | |||
| a463555dee | |||
| ca04b97c93 | |||
| 0a9bbaa104 | |||
| 39956efb3f | |||
| 597051e56f | |||
| 96722aa81d | |||
| 843b222723 | |||
| e515668edf | |||
| 5a499e70d5 | |||
| 6930a41116 | |||
| 998eea4a0e | |||
| c747d84576 | |||
| b2da14a05a | |||
| 7ea2adb802 | |||
| 3d13ca0e24 | |||
| 66ab3b13c9 | |||
| a8238bbdb0 | |||
| d43f914d42 | |||
| ed5272cf21 | |||
| c20ef40fd0 | |||
| db593aa67f | |||
| f98e307588 | |||
| 646a31e51e | |||
| be8ff88e66 | |||
| 1a6af1453d | |||
| 32aa74c09c | |||
| 7377dd0307 | |||
| 98c89e16ff | |||
| 324a3119b0 | |||
| 8a15c2603a | |||
| 043e4c4955 | |||
| ba7703e659 | |||
| f80ae5bdcf | |||
| 1a45a61387 | |||
| c3e9d5060e | |||
| 822de7fb94 | |||
| 8d84d836d1 | |||
| 950b71186f | |||
| e50a1f1a9c | |||
| a17cef70ea | |||
| 18dd5e01f2 | |||
| 6de3e13413 | |||
| ed3a1d2106 | |||
| 022afbeb4e | |||
| 2f925e5777 | |||
| de906b95f9 | |||
| d456aea71f | |||
| 621ca2c0ab | |||
| 6115b11582 | |||
| 5b8c390747 | |||
| 7525d5f3d5 | |||
| aabcd2cae3 | |||
| 0d115460a7 | |||
| 175bda67a1 | |||
| cba31c47c4 | |||
| a6fed02068 | |||
| d419aa5dc4 | |||
| f9bc5a0693 | |||
| 05e1f96419 | |||
| 6eae34533a | |||
| 63ced7b43f | |||
| dc47ba32f8 | |||
| edbf2d609e | |||
| 999328be0d | |||
| 98834fefaa | |||
| 90bd2ae172 | |||
| 5941e0b7ea | |||
| 9765940824 | |||
| 5ea5c514da | |||
| d3efde8176 | |||
| aea302be6c | |||
| cc05b90d86 | |||
| 1d0c9d6b2d | |||
| f62cad6431 | |||
| 5394ad7387 | |||
| 68e1ee0072 | |||
| 2858830c39 | |||
| d6484ef3c3 | |||
| 46fae69cf0 | |||
| f66f1e0fa3 | |||
| 887d7af882 | |||
| a92842454c | |||
| c8386fa61d | |||
| 87baebebd8 | |||
| e3d0a1d190 | |||
| d47b605eca | |||
| 22c6f6397f | |||
| 3ec97e2cc5 | |||
| 9b103a1d76 | |||
| b90b0852e9 | |||
| 9352cdb56d | |||
| 182f40ea8b | |||
| 3e887d2e0c | |||
| 0f87d8f7b2 | |||
| 4c33d67321 | |||
| cb234955df | |||
| 3a500cd0b6 | |||
| 868c546da4 | |||
| 99404f53c7 | |||
| 785d75a03b | |||
| 6d1479ca4b | |||
| b8b0859b5c | |||
| d7543862bd | |||
| c777df79f7 | |||
| cc2a77d7f1 | |||
| 9e2de9b9e9 | |||
| 109e15a335 | |||
| f192ca90e6 | |||
| f89d0e11bf | |||
| b4003d11fc | |||
| 292fc59d61 | |||
| afcb3f8863 | |||
| afb12e4294 | |||
| 24aebae177 | |||
| 39c0813a7f | |||
| 9b70e2b4c1 | |||
| 173daac19d | |||
| 04f2cfc894 | |||
| 811a6c0972 | |||
| 9b1769dd9a | |||
| 61c299f81f | |||
| 4acfa3354a | |||
| 88c8304104 | |||
| 6768ff4a22 | |||
| f2e7af9b86 | |||
| 7423cf0a9b | |||
| 460a2b1100 | |||
| 28566d73b3 | |||
| 98060b001d | |||
| f5a3c655b2 | |||
| 7169f87ad0 | |||
| b74d888c63 | |||
| 2007d4d54f | |||
| 48e925fab5 | |||
| 1903c0b8a3 | |||
| 86a1f67a3b | |||
| a257d9bccc | |||
| 015069b017 | |||
| fbefc8a78d | |||
| 26bc4bbcd8 | |||
| 3c3d767201 | |||
| 13cf6b6236 | |||
| 90d0a54c4d | |||
| 7a0a146c54 | |||
| 7ab643e425 | |||
| afb4429b4f | |||
| aa4502e7f3 | |||
| 17b4d85f63 | |||
| 1144a8efe7 | |||
| 08fb5587b4 | |||
| dbc18e7816 | |||
| 02bd654846 | |||
| 200bbf92e8 | |||
| 81ecf425f0 | |||
| 42d9a2c4c7 | |||
| 2ac74d098e | |||
| 584f5fb4c6 | |||
| d586ddc691 | |||
| 0b7e701dd4 | |||
| 947f2f5375 | |||
| 739e03b344 | |||
| da4e7687b5 | |||
| 39317cf42b | |||
| 2990cee95b | |||
| 0be6d05b5e | |||
| 77073c77bc | |||
| a7d5b016bd | |||
| d803786731 | |||
| 1534d389af | |||
| ece5a8b0b6 | |||
| 54072f315f | |||
| be633fba0f | |||
| ed6cfb90c8 | |||
| 6ed9f6047e | |||
| a44c4f1d2f | |||
| 88fcf00dda | |||
| d1f569b1b9 | |||
| 13698db634 | |||
| 2c4f59afc3 | |||
| 1c2bc7ead0 | |||
| 4055130a85 | |||
| 34120f5acd | |||
| 7489ec0bab | |||
| 70788bdbdc | |||
| c9c1b59e59 | |||
| 0350809f3a | |||
| a6977dbd15 | |||
| 2fa2a50bf9 | |||
| 08e15defa9 | |||
| b37685afbb | |||
| 792595b59d | |||
| 0c1c788312 | |||
| 56d64fbe30 | |||
| 608968b7c5 | |||
| 06ffc7e1d3 | |||
| d3cf61b89b | |||
| a39203f99e | |||
| 24e6ad3f16 | |||
| 2ef5d106bb | |||
| 0ed27ef66c | |||
| 900edfa8d4 | |||
| 88ad9ec6b2 | |||
| 40896bdf3f | |||
| 00ee37efa2 | |||
| 890f104cdf | |||
| 4a5e13149a | |||
| 97cc8729f0 | |||
| 4464109219 | |||
| 193e78e35d | |||
| bdb2cddafc | |||
| ebb3930d28 | |||
| cde384cd92 | |||
| 96e06e3cb7 | |||
| 17eb306fcc | |||
| 165cb56329 | |||
| d6da8a8ff2 | |||
| b4ac4fa04d | |||
| e136000595 | |||
| 86d9fc29cb | |||
| 506475de5f | |||
| cfe4532093 | |||
| 8fc88d63f1 | |||
| 6e74fd4945 | |||
| dcbac4cb4b | |||
| ed2462030f | |||
| cc5befbced | |||
| 2c89cd96a8 | |||
| a0304dc504 | |||
| c7941cca18 | |||
| b6dd32aa07 | |||
| f94886946e | |||
| 72dfe4c74f | |||
| 8b464d9660 | |||
| 889ebb2638 | |||
| 3ad986c28b | |||
| 344e193b7d | |||
| fb1c933ade | |||
| 72c5b97231 | |||
| fa93cd9f60 | |||
| aec9674dbe | |||
| 7fcc4223dc | |||
| 8262a3e23b | |||
| f211331c48 | |||
| 9053d0b134 | |||
| cb3f2d8d10 | |||
| c12df53b60 | |||
| d1aeea7553 | |||
| d8bccde686 | |||
| 20e489eaa1 | |||
| 4213475ec7 | |||
| d92879baf6 | |||
| 690fe019f0 | |||
| ed7a29d9f8 | |||
| 756848e79e | |||
| 18445edd0f | |||
| 30215ca61f | |||
| 838cedade7 | |||
| 4283a28c2f | |||
| 93a126fbc7 | |||
| 8e4b351a0c | |||
| 9869453c42 | |||
| 3642c59aa8 | |||
| 43eea2953b | |||
| de7eb10ce4 | |||
| fd11a325b8 | |||
| 4d17e20310 | |||
| 10fd1d7380 | |||
| 52b4f4a8d7 | |||
| e782e0a170 | |||
| dc2ceca5c5 | |||
| f8acd01ff7 | |||
| c48334d405 | |||
| 909fdaf152 | |||
| 8c1c926d00 | |||
| df6f3ce883 | |||
| 513f074766 | |||
| b07bf83c7d | |||
| 53e8cf53a4 | |||
| 54271bb766 | |||
| 9e96f56efb | |||
| b278911229 | |||
| 7bd0c7745c | |||
| 1cf0719ebd | |||
| 537d5ee025 | |||
| c8e5be35f7 | |||
| a6e72e1e4f | |||
| 5e83a7277f | |||
| 68af5f6c5c | |||
| 8de2901fea | |||
| c53e0730cb | |||
| a0e619e62a | |||
| 70116459c3 | |||
| 65e262b93b | |||
| 43faa0461a | |||
| 48cb2109b6 | |||
| a5450f11c9 | |||
| 9d98ab5ec6 | |||
| df5c879527 | |||
| 423e9f1cbe | |||
| 0bd7f8fca5 | |||
| d5615af9ae | |||
| 19dcc02a72 | |||
| 7feae92c1f | |||
| f851b84266 | |||
| fc966e9cc6 | |||
| ef19e67d2c | |||
| a41351f363 | |||
| 6aae216b4e | |||
| b22980a1dc | |||
| 881f735827 | |||
| 2f54045508 | |||
| 5aa6efb9a5 | |||
| 6ca0234478 | |||
| 649818995f | |||
| 7a0a9da72b | |||
| 69bff9bc89 | |||
| 41ca7eb491 | |||
| eef364723c | |||
| 0d6e187e88 | |||
| 9420a1fc30 | |||
| 583e900996 | |||
| 05e1fbfc52 | |||
| fe92176321 | |||
| 6d0df0ebeb | |||
| 0fa939e2d1 | |||
| 0422ce109f | |||
| 47bdee409c | |||
| 49f189439d | |||
| 5adf6f6b7f | |||
| 4115f19958 | |||
| 340d7b1b21 | |||
| 1bcbcbf574 | |||
| 82e43b2d7e | |||
| 67309a1cb5 | |||
| b724afe343 | |||
| 21f4f1c9a4 | |||
| b0c1f6202d | |||
| c0dfd97519 | |||
| a9138e85b1 | |||
| 0a05ed57e6 | |||
| 14288d1332 | |||
| b411418ff0 | |||
| 2bc0f72ae5 | |||
| 9c1244de57 | |||
| db2f8d915c | |||
| 6167c0e5d2 | |||
| ed2e464653 | |||
| 2c8ed8ee48 | |||
| ed50f46641 | |||
| 46e678bcff | |||
| 6b2427f995 | |||
| b07d741661 | |||
| 41fb013d29 | |||
| 32d4b669d0 | |||
| 3cde34a4a4 | |||
| bdb3660312 | |||
| f3a21e9c68 | |||
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import os
 import sys
@@ -8,12 +9,12 @@ import zipfile
 # Note that we have 400 MiB quota, please use it wisely.
 # See https://github.com/pypi/support/issues/3792 .
 # Please also sync the value with the one in Dockerfile.
-VLLM_MAX_SIZE_MB = int(os.environ.get('VLLM_MAX_SIZE_MB', 400))
+VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 400))
 
 
 def print_top_10_largest_files(zip_file):
     """Print the top 10 largest files in the given zip file."""
-    with zipfile.ZipFile(zip_file, 'r') as z:
+    with zipfile.ZipFile(zip_file, "r") as z:
         file_sizes = [(f, z.getinfo(f).file_size) for f in z.namelist()]
         file_sizes.sort(key=lambda x: x[1], reverse=True)
         for f, size in file_sizes[:10]:
@@ -28,14 +29,18 @@ def check_wheel_size(directory):
             wheel_path = os.path.join(root, file_name)
             wheel_size_mb = os.path.getsize(wheel_path) / (1024 * 1024)
             if wheel_size_mb > VLLM_MAX_SIZE_MB:
-                print(f"Not allowed: Wheel {wheel_path} is larger "
-                      f"({wheel_size_mb:.2f} MB) than the limit "
-                      f"({VLLM_MAX_SIZE_MB} MB).")
+                print(
+                    f"Not allowed: Wheel {wheel_path} is larger "
+                    f"({wheel_size_mb:.2f} MB) than the limit "
+                    f"({VLLM_MAX_SIZE_MB} MB)."
+                )
                 print_top_10_largest_files(wheel_path)
                 return 1
             else:
-                print(f"Wheel {wheel_path} is within the allowed size "
-                      f"({wheel_size_mb:.2f} MB).")
+                print(
+                    f"Wheel {wheel_path} is within the allowed size "
+                    f"({wheel_size_mb:.2f} MB)."
+                )
                 return 0
 
 
@@ -45,4 +50,4 @@ if __name__ == "__main__":
         sys.exit(1)
 
     directory = sys.argv[1]
     sys.exit(check_wheel_size(directory))
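The script above walks a directory for `.whl` files and fails the build if any wheel exceeds `VLLM_MAX_SIZE_MB`. A minimal standalone sketch of the same check, assuming a local `dist/` build directory (the path and the helper name are illustrative, not part of the diff):

```python
import os

# Same limit logic as the wheel-size check above: 400 MiB unless overridden via env var.
VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 400))


def oversized_wheels(directory: str) -> list[str]:
    """Return paths of wheels in `directory` that exceed the size limit."""
    too_big = []
    for root, _, files in os.walk(directory):
        for name in files:
            if name.endswith(".whl"):
                path = os.path.join(root, name)
                size_mb = os.path.getsize(path) / (1024 * 1024)
                if size_mb > VLLM_MAX_SIZE_MB:
                    too_big.append(path)
    return too_big


if __name__ == "__main__":
    # "dist" is an example build output directory.
    print(oversized_wheels("dist"))
```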
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import argparse
 import os
@@ -22,5 +23,5 @@ with open("index.html", "w") as f:
     print(f"Generated index.html for {args.wheel}")
     # cloudfront requires escaping the '+' character
     f.write(
-        template.format(wheel=filename,
-                        wheel_html_escaped=filename.replace("+", "%2B")))
+        template.format(wheel=filename, wheel_html_escaped=filename.replace("+", "%2B"))
+    )
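For context on the `wheel_html_escaped` value: CloudFront-served links need the `+` in a wheel filename percent-encoded, which is all `filename.replace("+", "%2B")` does. A tiny illustration (the filename is made up):

```python
# '+' in a wheel filename must appear as %2B in the generated index link.
filename = "vllm-0.9.0+cu128-cp38-abi3-manylinux1_x86_64.whl"  # example filename only
print(filename.replace("+", "%2B"))
# vllm-0.9.0%2Bcu128-cp38-abi3-manylinux1_x86_64.whl
```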
@@ -1,3 +1,4 @@
+# For vllm script, with -t option (tensor parallel size).
 # bash ./run-lm-eval-gsm-vllm-baseline.sh -m deepseek-ai/DeepSeek-V2-Lite-Chat -b "auto" -l 1000 -f 5 -t 2
 model_name: "deepseek-ai/DeepSeek-V2-Lite-Chat"
 tasks:

@@ -1,3 +1,4 @@
+# For hf script, without -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5
 model_name: "nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform"
 tasks:

@@ -1,3 +1,4 @@
+# For hf script, without -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-70B-Instruct -b 32 -l 250 -f 5
 model_name: "meta-llama/Meta-Llama-3-70B-Instruct"
 tasks:

@@ -1,3 +1,4 @@
+# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors -b auto -l 1000 -f 5 -t 1
 model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8A8-FP8-Channelwise-compressed-tensors"
 tasks:

@@ -1,3 +1,4 @@
+# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform -b auto -l 1000 -f 5 -t 1
 model_name: "nm-testing/Meta-Llama-3-8B-Instruct-FBGEMM-nonuniform"
 tasks:

@@ -1,3 +1,4 @@
+# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test -b 32 -l 1000 -f 5 -t 1
 model_name: "nm-testing/Meta-Llama-3-8B-FP8-compressed-tensors-test"
 tasks:

@@ -1,3 +1,4 @@
+# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Meta-Llama-3-8B-Instruct-FP8 -b 32 -l 250 -f 5 -t 1
 model_name: "neuralmagic/Meta-Llama-3-8B-Instruct-FP8"
 tasks:

@@ -1,3 +1,4 @@
+# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test -b "auto" -l 250 -f 5 -t 1
 model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test"
 tasks:

@@ -1,3 +1,4 @@
+# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test -b "auto" -l 250 -f 5 -t 1
 model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Per-Token-Test"
 tasks:

@@ -1,3 +1,4 @@
+# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test -b auto -l 1000 -f 5 -t 1
 model_name: "nm-testing/Meta-Llama-3-8B-Instruct-nonuniform-test"
 tasks:

@@ -1,4 +1,5 @@
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -l 250 -f 5 -t 1
+# For hf script, without -t option (tensor parallel size).
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -l 250 -f 5
 model_name: "meta-llama/Meta-Llama-3-8B-Instruct"
 tasks:
 - name: "gsm8k"

@@ -1,3 +1,4 @@
+# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m HandH1998/QQQ-Llama-3-8b-g128 -b 32 -l 1000 -f 5 -t 1
 model_name: "HandH1998/QQQ-Llama-3-8b-g128"
 tasks:
@@ -0,0 +1,11 @@
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Llama-3.2-1B-Instruct-FP8 -b "auto" -l 1319 -f 5 -t 1
+model_name: "RedHatAI/Llama-3.2-1B-Instruct-FP8"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.335
+  - name: "exact_match,flexible-extract"
+    value: 0.323
+limit: 1319
+num_fewshot: 5
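Configs like the one added above record the expected GSM8K scores for a model; the test shown further down loads the YAML and accepts measured scores within `RTOL = 0.08` of these values. A minimal sketch of that comparison, assuming the YAML is saved locally as `Llama-3.2-1B-Instruct-FP8.yaml` and using made-up measured scores:

```python
import numpy as np
import yaml

RTOL = 0.08  # same relative tolerance as the lm-eval correctness test

# Example local path; in CI the config lives under .buildkite/lm-eval-harness/configs/.
with open("Llama-3.2-1B-Instruct-FP8.yaml", encoding="utf-8") as f:
    cfg = yaml.safe_load(f)

# Made-up measured values standing in for lm_eval.simple_evaluate() output.
measured = {"exact_match,strict-match": 0.33, "exact_match,flexible-extract": 0.32}

for task in cfg["tasks"]:
    for metric in task["metrics"]:
        ok = np.isclose(metric["value"], measured[metric["name"]], rtol=RTOL)
        print(f"{task['name']} | {metric['name']}: {'ok' if ok else 'mismatch'}")
```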
@@ -1,3 +1,4 @@
+# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1
 model_name: "neuralmagic/Llama-3.2-1B-Instruct-quantized.w8a8"
 tasks:

@@ -1,3 +1,4 @@
+# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m mgoin/Minitron-4B-Base-FP8 -b auto -l 1000 -f 5 -t 1
 model_name: "mgoin/Minitron-4B-Base-FP8"
 tasks:

@@ -1,3 +1,4 @@
+# For vllm script, with -t option (tensor parallel size).
 # bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic -b "auto" -l 250 -f 5 -t 8
 model_name: "neuralmagic/Mixtral-8x22B-Instruct-v0.1-FP8-dynamic"
 tasks:

@@ -1,3 +1,4 @@
+# For vllm script, with -t option (tensor parallel size).
 # bash ./run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8 -b "auto" -l 250 -f 5 -t 4
 model_name: "neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8"
 tasks:

@@ -1,4 +1,5 @@
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1 -b 32 -l 250 -f 5 -t 4
+# For hf script, without -t option (tensor parallel size).
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1 -b 32 -l 250 -f 5
 model_name: "mistralai/Mixtral-8x7B-Instruct-v0.1"
 tasks:
 - name: "gsm8k"

@@ -1,3 +1,4 @@
+# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16 -b auto -l 1319 -f 5 -t 1
 model_name: "nm-testing/Qwen1.5-MoE-A2.7B-Chat-quantized.w4a16"
 tasks:

@@ -1,3 +1,4 @@
+# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-FP8W8 -b auto -l 1000 -f 5 -t 1
 model_name: "nm-testing/Qwen2-1.5B-Instruct-FP8W8"
 tasks:

@@ -1,3 +1,4 @@
+# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8 -b "auto" -l 1000 -f 5 -t 1
 model_name: "neuralmagic/Qwen2-1.5B-Instruct-quantized.w8a8"
 tasks:

@@ -1,3 +1,4 @@
+# For vllm script, with -t option (tensor parallel size).
 # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise -b "auto" -l 1000 -f 5 -t 1
 model_name: "nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise"
 tasks:

@@ -1,3 +1,4 @@
+# For vllm script, with -t option (tensor parallel size).
 # bash ./run-lm-eval-gsm-vllm-baseline.sh -m Qwen/Qwen2-57B-A14B-Instruct -b "auto" -l 250 -f 5 -t 4
 model_name: "Qwen/Qwen2-57B-A14B-Instruct"
 tasks:
@@ -0,0 +1,11 @@
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m Qwen/Qwen2.5-1.5B-Instruct -b auto -l 1319 -f 5 -t 1
+model_name: "Qwen/Qwen2.5-1.5B-Instruct"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.54
+  - name: "exact_match,flexible-extract"
+    value: 0.59
+limit: 1319
+num_fewshot: 5
@@ -0,0 +1,11 @@
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -b auto -l 1319 -f 5 -t 1
+model_name: "RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.47
+  - name: "exact_match,flexible-extract"
+    value: 0.64
+limit: 1319
+num_fewshot: 5
@@ -1,3 +1,4 @@
+# For vllm script, with -t option (tensor parallel size).
 # bash ./run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM -b "auto" -t 2
 model_name: "nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM"
 tasks:

@@ -3,3 +3,4 @@ Meta-Llama-3-70B-Instruct.yaml
 Mixtral-8x7B-Instruct-v0.1.yaml
 Qwen2-57B-A14-Instruct.yaml
 DeepSeek-V2-Lite-Chat.yaml
+Meta-Llama-3-8B-QQQ.yaml
@@ -1,10 +1,6 @@
-Meta-Llama-3-8B-Instruct.yaml
-Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
+Qwen2.5-1.5B-Instruct.yaml
 Meta-Llama-3.2-1B-Instruct-INT8-compressed-tensors.yaml
 Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml
 Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
-Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml
+Qwen2.5-VL-3B-Instruct-FP8-dynamic.yaml
 Qwen1.5-MoE-W4A16-compressed-tensors.yaml
-Qwen2-1.5B-Instruct-INT8-compressed-tensors.yaml
-Qwen2-1.5B-Instruct-FP8W8.yaml
-Meta-Llama-3-8B-QQQ.yaml

.buildkite/lm-eval-harness/conftest.py (new file, 44 lines)
@@ -0,0 +1,44 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from pathlib import Path
+
+import pytest
+
+
+def pytest_addoption(parser):
+    parser.addoption(
+        "--config-list-file",
+        action="store",
+        help="Path to the file listing model config YAMLs (one per line)",
+    )
+    parser.addoption(
+        "--tp-size",
+        action="store",
+        default="1",
+        help="Tensor parallel size to use for evaluation",
+    )
+
+
+@pytest.fixture(scope="session")
+def config_list_file(pytestconfig, config_dir):
+    rel_path = pytestconfig.getoption("--config-list-file")
+    return config_dir / rel_path
+
+
+@pytest.fixture(scope="session")
+def tp_size(pytestconfig):
+    return pytestconfig.getoption("--tp-size")
+
+
+def pytest_generate_tests(metafunc):
+    if "config_filename" in metafunc.fixturenames:
+        rel_path = metafunc.config.getoption("--config-list-file")
+        config_list_file = Path(rel_path).resolve()
+        config_dir = config_list_file.parent
+        with open(config_list_file, encoding="utf-8") as f:
+            configs = [
+                config_dir / line.strip()
+                for line in f
+                if line.strip() and not line.startswith("#")
+            ]
+        metafunc.parametrize("config_filename", configs)
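With this conftest, every non-comment line in the config list file becomes one parametrized `config_filename` test case, and `--tp-size` feeds the `tp_size` fixture. A sketch of an equivalent invocation through pytest's Python API, assuming it is run from `.buildkite/lm-eval-harness/` (paths are illustrative):

```python
import sys

import pytest

# Equivalent to:
#   pytest -s -v test_lm_eval_correctness.py \
#       --config-list-file=configs/models-small.txt --tp-size=1
sys.exit(
    pytest.main(
        [
            "-s",
            "-v",
            "test_lm_eval_correctness.py",
            "--config-list-file=configs/models-small.txt",
            "--tp-size=1",
        ]
    )
)
```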
@@ -1,59 +0,0 @@
-#!/bin/bash
-
-usage() {
-    echo``
-    echo "Runs lm eval harness on GSM8k using vllm and compares to "
-    echo "precomputed baseline (measured by HF transformers.)"
-    echo
-    echo "usage: ${0} <options>"
-    echo
-    echo "  -c    - path to the test data config (e.g. configs/small-models.txt)"
-    echo "  -t    - tensor parallel size"
-    echo
-}
-
-SUCCESS=0
-
-while getopts "c:t:" OPT; do
-  case ${OPT} in
-    c )
-      CONFIG="$OPTARG"
-      ;;
-    t )
-      TP_SIZE="$OPTARG"
-      ;;
-    \? )
-      usage
-      exit 1
-      ;;
-  esac
-done
-
-# Parse list of configs.
-IFS=$'\n' read -d '' -r -a MODEL_CONFIGS < "$CONFIG"
-
-for MODEL_CONFIG in "${MODEL_CONFIGS[@]}"
-do
-    LOCAL_SUCCESS=0
-
-    echo "=== RUNNING MODEL: $MODEL_CONFIG WITH TP SIZE: $TP_SIZE==="
-
-    export LM_EVAL_TEST_DATA_FILE=$PWD/configs/${MODEL_CONFIG}
-    export LM_EVAL_TP_SIZE=$TP_SIZE
-    pytest -s test_lm_eval_correctness.py || LOCAL_SUCCESS=$?
-
-    if [[ $LOCAL_SUCCESS == 0 ]]; then
-        echo "=== PASSED MODEL: ${MODEL_CONFIG} ==="
-    else
-        echo "=== FAILED MODEL: ${MODEL_CONFIG} ==="
-    fi
-
-    SUCCESS=$((SUCCESS + LOCAL_SUCCESS))
-
-done
-
-if [ "${SUCCESS}" -eq "0" ]; then
-  exit 0
-else
-  exit 1
-fi
@ -1,69 +1,55 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
"""
|
"""
|
||||||
LM eval harness on model to compare vs HF baseline computed offline.
|
LM eval harness on model to compare vs HF baseline computed offline.
|
||||||
Configs are found in configs/$MODEL.yaml
|
Configs are found in configs/$MODEL.yaml
|
||||||
|
|
||||||
* export LM_EVAL_TEST_DATA_FILE=configs/Meta-Llama-3-70B-Instruct.yaml
|
pytest -s -v test_lm_eval_correctness.py \
    --config-list-file=configs/models-small.txt \
    --tp-size=1
"""

import lm_eval
import numpy as np
import yaml

RTOL = 0.08


def launch_lm_eval(eval_config, tp_size):
    trust_remote_code = eval_config.get("trust_remote_code", False)
    model_args = (
        f"pretrained={eval_config['model_name']},"
        f"tensor_parallel_size={tp_size},"
        f"enforce_eager=true,"
        f"add_bos_token=true,"
        f"trust_remote_code={trust_remote_code}"
    )
    results = lm_eval.simple_evaluate(
        model="vllm",
        model_args=model_args,
        tasks=[task["name"] for task in eval_config["tasks"]],
        num_fewshot=eval_config["num_fewshot"],
        limit=eval_config["limit"],
        batch_size="auto",
    )
    return results


def test_lm_eval_correctness_param(config_filename, tp_size):
    eval_config = yaml.safe_load(config_filename.read_text(encoding="utf-8"))

    results = launch_lm_eval(eval_config, tp_size)

    success = True
    for task in eval_config["tasks"]:
        for metric in task["metrics"]:
            ground_truth = metric["value"]
            measured_value = results["results"][task["name"]][metric["name"]]
            print(
                f"{task['name']} | {metric['name']}: "
                f"ground_truth={ground_truth} | measured={measured_value}"
            )
            success = success and np.isclose(ground_truth, measured_value, rtol=RTOL)

    # Assert at the end, print all scores even on failure for debugging.
    assert success
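The parametrized test above receives `config_filename` and `tp_size` from the pytest command line (`--config-list-file`, `--tp-size`). The fixture wiring that feeds them is not part of this diff; the sketch below is a hypothetical, minimal conftest assuming `pytest_addoption`/`pytest_generate_tests` expand the config list into one test case per YAML file. The real conftest in `.buildkite/lm-eval-harness` may differ.

```python
# Hypothetical conftest.py sketch -- illustrative only, not taken from this diff.
from pathlib import Path


def pytest_addoption(parser):
    # The two options referenced in the test docstring.
    parser.addoption("--config-list-file", action="store",
                     help="text file listing one config YAML per line")
    parser.addoption("--tp-size", action="store", default="1",
                     help="tensor parallel size passed to launch_lm_eval")


def pytest_generate_tests(metafunc):
    # Expand the config list into one test case per YAML file and feed
    # the requested tensor-parallel size to every case.
    if "config_filename" in metafunc.fixturenames:
        list_file = Path(metafunc.config.getoption("--config-list-file"))
        configs = [
            list_file.parent / line.strip()
            for line in list_file.read_text(encoding="utf-8").splitlines()
            if line.strip() and not line.startswith("#")
        ]
        metafunc.parametrize("config_filename", configs)
    if "tp_size" in metafunc.fixturenames:
        metafunc.parametrize("tp_size", [metafunc.config.getoption("--tp-size")])
```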
@@ -11,7 +11,7 @@ See [vLLM performance dashboard](https://perf.vllm.ai) for the latest performanc

## Performance benchmark quick overview

**Benchmarking Coverage**: latency, throughput and fix-qps serving on A100 (the support for FP8 benchmark on H100 is coming!) and Intel® Xeon® Processors, with different models.

**Benchmarking Duration**: about 1hr.
@@ -31,13 +31,27 @@ Performance benchmark will be triggered when:
- A PR being merged into vllm.
- Every commit for those PRs with `perf-benchmarks` label AND `ready` label.

Manually trigger the benchmark:

```bash
bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
```

Runtime environment variables:

- `ON_CPU`: set the value to '1' on Intel® Xeon® Processors. Default value is 0.
- `SERVING_JSON`: JSON file to use for the serving tests. Default value is empty string (use default file).
- `LATENCY_JSON`: JSON file to use for the latency tests. Default value is empty string (use default file).
- `THROUGHPUT_JSON`: JSON file to use for the throughput tests. Default value is empty string (use default file).
- `REMOTE_HOST`: IP for the remote vLLM service to benchmark. Default value is empty string.
- `REMOTE_PORT`: Port for the remote vLLM service to benchmark. Default value is empty string.
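These variables are read by `run-performance-benchmarks.sh` at startup. As a quick illustration, here is a minimal sketch of driving the script with a few of them set from Python; the chosen values (CPU run, an alternate serving test file, no remote endpoint) are only examples, and the script can equally be launched straight from bash as shown above.

```python
# Minimal sketch: launch the benchmark script with the documented
# environment variables set. The values below are illustrative only.
import os
import subprocess

env = os.environ.copy()
env.update(
    {
        "ON_CPU": "1",                             # run on Intel Xeon instead of GPU
        "SERVING_JSON": "serving-tests-cpu.json",  # file name under tests/ to use
        "REMOTE_HOST": "",                         # empty -> start a local vLLM server
        "REMOTE_PORT": "",
    }
)

subprocess.run(
    ["bash", ".buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh"],
    check=True,
    env=env,
)
```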
Nightly benchmark will be triggered when:
- Every commit for those PRs with `perf-benchmarks` label and `nightly-benchmarks` label.

## Performance benchmark details

See [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases.

> NOTE: For Intel® Xeon® Processors, use `tests/latency-tests-cpu.json`, `tests/throughput-tests-cpu.json`, `tests/serving-tests-cpu.json` instead.

### Latency test

Here is an example of one test inside `latency-tests.json`:
@@ -113,12 +127,36 @@ WARNING: The benchmarking script will save json results by itself, so please do

### Visualizing the results

The `convert-results-json-to-markdown.py` script helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](performance-benchmarks-descriptions.md) with real benchmarking results.
You can find the result presented as a table inside the `buildkite/performance-benchmark` job page.
If you do not see the table, please wait until the benchmark finishes running.
The json version of the table (together with the json version of the benchmark) will also be attached to the markdown file.
The raw benchmarking results (in the format of json files) are in the `Artifacts` tab of the benchmarking job.
The `compare-json-results.py` script compares benchmark result JSON files that were converted with `convert-results-json-to-markdown.py`.
When run, the benchmark script generates results under the `benchmark/results` folder, along with `benchmark_results.md` and `benchmark_results.json`.
`compare-json-results.py` compares two `benchmark_results.json` files and reports the performance ratio for metrics such as Output Tput, Median TTFT and Median TPOT.

Here is an example using the script to compare result_a and result_b without detailed test names:
`python3 compare-json-results.py -f results_a/benchmark_results.json -f results_b/benchmark_results.json --ignore_test_name`

|   | results_a/benchmark_results.json | results_b/benchmark_results.json | perf_ratio |
|---|----------------------------------|----------------------------------|------------|
| 0 | 142.633982 | 156.526018 | 1.097396 |
| 1 | 241.620334 | 294.018783 | 1.216863 |
| 2 | 218.298905 | 262.664916 | 1.203235 |
| 3 | 242.743860 | 299.816190 | 1.235113 |

Here is an example using the script to compare result_a and result_b with detailed test names:
`python3 compare-json-results.py -f results_a/benchmark_results.json -f results_b/benchmark_results.json`

|   | results_a/benchmark_results.json_name | results_a/benchmark_results.json | results_b/benchmark_results.json_name | results_b/benchmark_results.json | perf_ratio |
|---|---------------------------------------|----------------------------------|---------------------------------------|----------------------------------|------------|
| 0 | serving_llama8B_tp1_sharegpt_qps_1 | 142.633982 | serving_llama8B_tp1_sharegpt_qps_1 | 156.526018 | 1.097396 |
| 1 | serving_llama8B_tp1_sharegpt_qps_16 | 241.620334 | serving_llama8B_tp1_sharegpt_qps_16 | 294.018783 | 1.216863 |
| 2 | serving_llama8B_tp1_sharegpt_qps_4 | 218.298905 | serving_llama8B_tp1_sharegpt_qps_4 | 262.664916 | 1.203235 |
| 3 | serving_llama8B_tp1_sharegpt_qps_inf | 242.743860 | serving_llama8B_tp1_sharegpt_qps_inf | 299.816190 | 1.235113 |
| 4 | serving_llama8B_tp2_random_1024_128_qps_1 | 96.613390 | serving_llama8B_tp4_random_1024_128_qps_1 | 108.404853 | 1.122048 |
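As a side note on the tables above, the `perf_ratio` column is simply the element-wise division of the second file's metric column by the first, so it can also be reproduced directly with pandas. A minimal sketch follows, assuming both files share the column layout produced by `convert-results-json-to-markdown.py` and list the same tests in the same order (filtering to serving-only rows is omitted for brevity):

```python
# Minimal sketch of the ratio reported by compare-json-results.py:
# a metric column from the second results file divided, row by row,
# by the same column from the first file.
import pandas as pd

baseline = pd.read_json("results_a/benchmark_results.json")   # illustrative paths
candidate = pd.read_json("results_b/benchmark_results.json")

metric = "Output Tput (tok/s)"  # any shared numeric column works
comparison = pd.DataFrame(
    {
        "Test name": baseline["Test name"],
        "baseline": baseline[metric],
        "candidate": candidate[metric],
    }
)
comparison["perf_ratio"] = comparison["candidate"] / comparison["baseline"]
print(comparison)
```

For throughput columns, a ratio above 1.0 means `results_b` achieved more tokens per second than `results_a` on that row; for latency columns such as Median TTFT, a ratio above 1.0 means `results_b` was slower.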
## Nightly test details

See [nightly-descriptions.md](nightly-descriptions.md) for a detailed description of the test workload, models and docker containers used when benchmarking other LLM engines.
@@ -16,7 +16,7 @@ Please download the visualization scripts in the post
- Download `nightly-benchmarks.zip`.
- In the same folder, run the following code:

```bash
export HF_TOKEN=<your HF token>
apt update
apt install -y git
```
@@ -4,7 +4,8 @@
- Input length: 32 tokens.
- Output length: 128 tokens.
- Batch size: fixed (8).
- GPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
- CPU Models: llama-3.1 8B.
- Evaluation metrics: end-to-end latency (mean, median, p99).

{latency_tests_markdown_table}

@@ -14,7 +15,8 @@
- Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed).
- Output length: the corresponding output length of these 200 prompts.
- Batch size: dynamically determined by vllm to achieve maximum throughput.
- GPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
- CPU Models: llama-3.1 8B.
- Evaluation metrics: throughput.

{throughput_tests_markdown_table}

@@ -25,12 +27,18 @@
- Output length: the corresponding output length of these 200 prompts.
- Batch size: dynamically determined by vllm and the arrival pattern of the requests.
- **Average QPS (query per second)**: 1, 4, 16 and inf. QPS = inf means all requests come at once. For other QPS values, the arrival time of each query is determined using a random Poisson process (with fixed random seed).
- GPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
- We also added a speculative decoding test for llama-3 70B on GPU, under QPS 2.
- CPU Models: llama-3.1 8B.
- Evaluation metrics: throughput, TTFT (time to the first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99).
- For CPU, we added random dataset tests to benchmark fixed input/output length with 100 prompts.

{serving_tests_markdown_table}

## Platform Information

{platform_markdown_table}

## json version of the benchmarking tables

This section contains the data of the markdown tables above in JSON format.
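The serving description above states that, for finite QPS values, request arrival times are drawn from a Poisson process with a fixed seed. As a point of reference, a Poisson arrival stream with rate `qps` has exponentially distributed inter-arrival gaps with mean `1/qps`. The sketch below (plain numpy, not taken from the benchmark client itself) shows how such a schedule can be generated:

```python
# Sketch: Poisson arrivals at a given average QPS.
# Inter-arrival times of a Poisson process are exponential with mean 1/qps.
import numpy as np


def poisson_arrival_times(num_requests: int, qps: float, seed: int = 0) -> np.ndarray:
    rng = np.random.default_rng(seed)       # fixed seed -> reproducible schedule
    gaps = rng.exponential(scale=1.0 / qps, size=num_requests)
    return np.cumsum(gaps)                  # absolute send times in seconds


print(poisson_arrival_times(num_requests=5, qps=4))
```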
@@ -0,0 +1,66 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse

import pandas as pd


def compare_data_columns(
    files, name_column, data_column, drop_column, ignore_test_name=False
):
    print("\ncompare_data_column: " + data_column)
    frames = []
    compare_frames = []
    for file in files:
        data_df = pd.read_json(file)
        serving_df = data_df.dropna(subset=[drop_column], ignore_index=True)
        if ignore_test_name is False:
            serving_df = serving_df.rename(columns={name_column: file + "_name"})
            frames.append(serving_df[file + "_name"])
        serving_df = serving_df.rename(columns={data_column: file})
        frames.append(serving_df[file])
        compare_frames.append(serving_df[file])
        if len(compare_frames) >= 2:
            # Compare numbers among two files
            ratio_df = compare_frames[1] / compare_frames[0]
            frames.append(ratio_df)
            compare_frames.pop(1)

    concat_df = pd.concat(frames, axis=1)
    return concat_df


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-f", "--file", action="append", type=str, help="input file name"
    )
    parser.add_argument(
        "--ignore_test_name", action="store_true", help="ignore_test_name or not"
    )
    args = parser.parse_args()
    files = args.file
    print("comparing : " + ", ".join(files))

    drop_column = "P99"
    name_column = "Test name"
    data_cols_to_compare = ["Output Tput (tok/s)", "Median TTFT (ms)", "Median"]
    html_msgs_for_data_cols = [
        "Compare Output Tokens /n",
        "Median TTFT /n",
        "Median TPOT /n",
    ]
    ignore_test_name = args.ignore_test_name
    with open("perf_comparison.html", "w") as text_file:
        for i in range(len(data_cols_to_compare)):
            output_df = compare_data_columns(
                files,
                name_column,
                data_cols_to_compare[i],
                drop_column,
                ignore_test_name=ignore_test_name,
            )
            print(output_df)
            html = output_df.to_html()
            text_file.write(html_msgs_for_data_cols[i])
            text_file.write(html)
@@ -1,10 +1,13 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import json
import os
from importlib import util
from pathlib import Path

import pandas as pd
import psutil
from tabulate import tabulate

results_folder = Path("results/")
@@ -28,11 +31,11 @@ throughput_results = []
throughput_results_column_mapping = {
    "test_name": "Test name",
    "gpu_type": "GPU",
    "num_requests": "# of req.",
    "total_num_tokens": "Total # of tokens",
    "elapsed_time": "Elapsed time (s)",
    "requests_per_second": "Tput (req/s)",
    "tokens_per_second": "Tput (tok/s)",
}

# serving results and the keys that will be printed into markdown
@@ -40,16 +43,18 @@ serving_results = []
serving_column_mapping = {
    "test_name": "Test name",
    "gpu_type": "GPU",
    "completed": "# of req.",
    "request_throughput": "Tput (req/s)",
    "total_token_throughput": "Total Token Tput (tok/s)",
    "output_throughput": "Output Tput (tok/s)",
    "total_input_tokens": "Total input tokens",
    "total_output_tokens": "Total output tokens",
    "mean_ttft_ms": "Mean TTFT (ms)",
    "median_ttft_ms": "Median TTFT (ms)",
    "p99_ttft_ms": "P99 TTFT (ms)",
    "mean_tpot_ms": "Mean TPOT (ms)",
    "median_tpot_ms": "Median",
    "p99_tpot_ms": "P99",
    "mean_itl_ms": "Mean ITL (ms)",
    "median_itl_ms": "Median ITL (ms)",
    "p99_itl_ms": "P99 ITL (ms)",
@@ -65,18 +70,32 @@ def read_markdown(file):


def results_to_json(latency, throughput, serving):
    return json.dumps(
        {
            "latency": latency.to_dict(),
            "throughput": throughput.to_dict(),
            "serving": serving.to_dict(),
        }
    )


def get_size_with_unit(bytes, suffix="B"):
    """
    Scale bytes to its proper format
    e.g:
        1253656 => '1.20MB'
        1253656678 => '1.17GB'
    """
    factor = 1024
    for unit in ["", "K", "M", "G", "T", "P"]:
        if bytes < factor:
            return f"{bytes:.2f}{unit}{suffix}"
        bytes /= factor


if __name__ == "__main__":
    # collect results
    for test_file in results_folder.glob("*.json"):
        with open(test_file) as f:
            raw_result = json.loads(f.read())
@@ -120,7 +139,8 @@ if __name__ == "__main__":
            for perc in [10, 25, 50, 75, 90, 99]:
                # Multiply 1000 to convert the time unit from s to ms
                raw_result.update(
                    {f"P{perc}": 1000 * raw_result["percentiles"][str(perc)]}
                )
            raw_result["avg_latency"] = raw_result["avg_latency"] * 1000

            # add the result to raw_result
@@ -153,26 +173,48 @@ if __name__ == "__main__":
    serving_results = pd.DataFrame.from_dict(serving_results)
    throughput_results = pd.DataFrame.from_dict(throughput_results)

    svmem = psutil.virtual_memory()
    platform_data = {
        "Physical cores": [psutil.cpu_count(logical=False)],
        "Total cores": [psutil.cpu_count(logical=True)],
        "Total Memory": [get_size_with_unit(svmem.total)],
    }

    if util.find_spec("numa") is not None:
        from numa import info

        platform_data["Total NUMA nodes"] = [info.get_num_configured_nodes()]

    if util.find_spec("cpuinfo") is not None:
        from cpuinfo import get_cpu_info

        platform_data["CPU Brand"] = [get_cpu_info()["brand_raw"]]

    platform_results = pd.DataFrame.from_dict(
        platform_data, orient="index", columns=["Platform Info"]
    )

    raw_results_json = results_to_json(
        latency_results, throughput_results, serving_results
    )

    # remapping the key, for visualization purpose
    if not latency_results.empty:
        latency_results = latency_results[list(latency_column_mapping.keys())].rename(
            columns=latency_column_mapping
        )
    if not serving_results.empty:
        serving_results = serving_results[list(serving_column_mapping.keys())].rename(
            columns=serving_column_mapping
        )
    if not throughput_results.empty:
        throughput_results = throughput_results[
            list(throughput_results_column_mapping.keys())
        ].rename(columns=throughput_results_column_mapping)

    processed_results_json = results_to_json(
        latency_results, throughput_results, serving_results
    )

    for df in [latency_results, serving_results, throughput_results]:
        if df.empty:
@@ -184,38 +226,43 @@ if __name__ == "__main__":
        # The GPUs sometimes come in format of "GPUTYPE\nGPUTYPE\n...",
        # we want to turn it into "8xGPUTYPE"
        df["GPU"] = df["GPU"].apply(
            lambda x: f"{len(x.split('\n'))}x{x.split('\n')[0]}"
        )

    # get markdown tables
    latency_md_table = tabulate(
        latency_results, headers="keys", tablefmt="pipe", showindex=False
    )
    serving_md_table = tabulate(
        serving_results, headers="keys", tablefmt="pipe", showindex=False
    )
    throughput_md_table = tabulate(
        throughput_results, headers="keys", tablefmt="pipe", showindex=False
    )
    platform_md_table = tabulate(
        platform_results, headers="keys", tablefmt="pipe", showindex=True
    )

    # document the result
    with open(results_folder / "benchmark_results.md", "w") as f:
        results = read_markdown(
            "../.buildkite/nightly-benchmarks/"
            + "performance-benchmarks-descriptions.md"
        )
        results = results.format(
            latency_tests_markdown_table=latency_md_table,
            throughput_tests_markdown_table=throughput_md_table,
            serving_tests_markdown_table=serving_md_table,
            platform_markdown_table=platform_md_table,
            benchmarking_results_in_json_string=processed_results_json,
        )
        f.write(results)

    # document benchmarking results in json
    with open(results_folder / "benchmark_results.json", "w") as f:
        results = (
            latency_results.to_dict(orient="records")
            + throughput_results.to_dict(orient="records")
            + serving_results.to_dict(orient="records")
        )
        f.write(json.dumps(results))
@@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import argparse

@@ -14,15 +15,12 @@ def main(model, cachedir):


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Download and save Hugging Face tokenizer"
    )
    parser.add_argument("--model", type=str, required=True, help="Name of the model")
    parser.add_argument(
        "--cachedir", type=str, required=True, help="Directory to save the tokenizer"
    )

    args = parser.parse_args()
    main(args.model, args.cachedir)
@@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import argparse
import json

@@ -11,33 +12,33 @@ from tabulate import tabulate


def parse_arguments():
    parser = argparse.ArgumentParser(
        description="Parse command line arguments for summary-nightly-results script."
    )
    parser.add_argument(
        "--results-folder",
        type=str,
        required=True,
        help="The folder where the results are stored.",
    )
    parser.add_argument(
        "--description", type=str, required=True, help="Description of the results."
    )

    args = parser.parse_args()
    return args


def get_perf(df, method, model, metric):

    means = []

    for qps in [2, 4, 8, 16, "inf"]:
        target = df["Test name"].str.contains(model)
        target = target & df["Engine"].str.contains(method)
        target = target & df["Test name"].str.contains("qps_" + str(qps))
        filtered_df = df[target]

        if filtered_df.empty:
            means.append(0.0)
        else:
            means.append(filtered_df[metric].values[0])

@@ -45,7 +46,6 @@ def get_perf(df, method, model, metric):


def get_perf_w_std(df, method, model, metric):
    if metric in ["TTFT", "ITL"]:
        mean = get_perf(df, method, model, "Mean " + metric + " (ms)")
        mean = mean.tolist()

@@ -60,7 +60,8 @@ def get_perf_w_std(df, method, model, metric):
    else:
        assert metric == "Tput"
        mean = get_perf(df, method, model, "Input Tput (tok/s)") + get_perf(
            df, method, model, "Output Tput (tok/s)"
        )
        mean = mean.tolist()
        std = None

@@ -80,18 +81,17 @@ def main(args):
    # generate markdown table
    df = pd.DataFrame.from_dict(results)

    md_table = tabulate(df, headers="keys", tablefmt="pipe", showindex=False)

    with open(args.description) as f:
        description = f.read()

    description = description.format(nightly_results_benchmarking_table=md_table)

    with open("nightly_results.md", "w") as f:
        f.write(description)


if __name__ == "__main__":
    args = parse_arguments()
    main(args)
@@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

from lmdeploy.serve.openai.api_client import APIClient
@@ -31,6 +31,20 @@ check_gpus() {
  echo "GPU type is $gpu_type"
}

check_cpus() {
  # check the number of CPUs and NUMA Node and GPU type.
  declare -g numa_count=$(python3 -c "from numa import info;numa_size = info.get_num_configured_nodes(); print(numa_size)")
  if [[ $numa_count -gt 0 ]]; then
    echo "NUMA found."
    echo $numa_count
  else
    echo "Need at least 1 NUMA to run benchmarking."
    exit 1
  fi
  declare -g gpu_type="cpu"
  echo "GPU type is $gpu_type"
}

check_hf_token() {
  # check if HF_TOKEN is available and valid
  if [[ -z "$HF_TOKEN" ]]; then

@@ -69,6 +83,22 @@ json2args() {
  echo "$args"
}

json2envs() {
  # transforms the JSON string to environment variables.
  # example:
  # input: { "VLLM_CPU_KVCACHE_SPACE": 5 }
  # output: VLLM_CPU_KVCACHE_SPACE=5
  local json_string=$1
  local args=$(
    echo "$json_string" | jq -r '
      to_entries |
      map((.key ) + "=" + (.value | tostring)) |
      join(" ")
    '
  )
  echo "$args"
}

wait_for_server() {
  # wait for vllm server to start
  # return 1 if vllm server crashes
@@ -158,15 +188,24 @@ run_latency_tests() {
    # get arguments
    latency_params=$(echo "$params" | jq -r '.parameters')
    latency_args=$(json2args "$latency_params")
    latency_environment_variables=$(echo "$params" | jq -r '.environment_variables')
    latency_envs=$(json2envs "$latency_environment_variables")

    # check if there is enough GPU to run the test
    tp=$(echo "$latency_params" | jq -r '.tensor_parallel_size')
    if [ "$ON_CPU" == "1" ];then
      if [[ $numa_count -lt $tp ]]; then
        echo "Required tensor-parallel-size $tp but only $numa_count NUMA nodes found. Skip testcase $test_name."
        continue
      fi
    else
      if [[ $gpu_count -lt $tp ]]; then
        echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
        continue
      fi
    fi

    latency_command=" $latency_envs python3 benchmark_latency.py \
      --output-json $RESULTS_FOLDER/${test_name}.json \
      $latency_args"
@@ -216,15 +255,24 @@ run_throughput_tests() {
    # get arguments
    throughput_params=$(echo "$params" | jq -r '.parameters')
    throughput_args=$(json2args "$throughput_params")
    throughput_environment_variables=$(echo "$params" | jq -r '.environment_variables')
    throughput_envs=$(json2envs "$throughput_environment_variables")

    # check if there is enough GPU to run the test
    tp=$(echo "$throughput_params" | jq -r '.tensor_parallel_size')
    if [ "$ON_CPU" == "1" ];then
      if [[ $numa_count -lt $tp ]]; then
        echo "Required tensor-parallel-size $tp but only $numa_count NUMA nodes found. Skip testcase $test_name."
        continue
      fi
    else
      if [[ $gpu_count -lt $tp ]]; then
        echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
        continue
      fi
    fi

    throughput_command=" $throughput_envs python3 benchmark_throughput.py \
      --output-json $RESULTS_FOLDER/${test_name}.json \
      $throughput_args"
@@ -272,18 +320,27 @@ run_serving_tests() {

    # get client and server arguments
    server_params=$(echo "$params" | jq -r '.server_parameters')
    server_envs=$(echo "$params" | jq -r '.server_environment_variables')
    client_params=$(echo "$params" | jq -r '.client_parameters')
    server_args=$(json2args "$server_params")
    server_envs=$(json2envs "$server_envs")
    client_args=$(json2args "$client_params")
    qps_list=$(echo "$params" | jq -r '.qps_list')
    qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
    echo "Running over qps list $qps_list"

    # check if there is enough resources to run the test
    tp=$(echo "$server_params" | jq -r '.tensor_parallel_size')
    if [ "$ON_CPU" == "1" ];then
      if [[ $numa_count -lt $tp ]]; then
        echo "Required tensor-parallel-size $tp but only $numa_count NUMA nodes found. Skip testcase $test_name."
        continue
      fi
    else
      if [[ $gpu_count -lt $tp ]]; then
        echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
        continue
      fi
    fi

    # check if server model and client model is aligned
@@ -294,23 +351,33 @@ run_serving_tests() {
      continue
    fi

    server_command="$server_envs python3 \
      -m vllm.entrypoints.openai.api_server \
      $server_args"

    # run the server
    echo "Running test case $test_name"
    echo "Server command: $server_command"
    # support remote vllm server
    client_remote_args=""
    if [[ -z "${REMOTE_HOST}" ]]; then
      bash -c "$server_command" &
      server_pid=$!
      # wait until the server is alive
      if wait_for_server; then
        echo ""
        echo "vLLM server is up and running."
      else
        echo ""
        echo "vLLM failed to start within the timeout period."
      fi
    else
      server_command="Using Remote Server $REMOTE_HOST $REMOTE_PORT"
      if [[ ${REMOTE_PORT} ]]; then
        client_remote_args=" --host=$REMOTE_HOST --port=$REMOTE_PORT "
      else
        client_remote_args=" --host=$REMOTE_HOST "
      fi
    fi

    # iterate over different QPS
@@ -332,7 +399,7 @@ run_serving_tests() {
        --result-filename ${new_test_name}.json \
        --request-rate $qps \
        --metadata "tensor_parallel_size=$tp" \
        $client_args $client_remote_args "

      echo "Running test case $test_name with qps $qps"
      echo "Client command: $client_command"

@@ -360,7 +427,14 @@ run_serving_tests() {
}

main() {
  local ARCH
  ARCH=''
  if [ "$ON_CPU" == "1" ];then
    check_cpus
    ARCH='-cpu'
  else
    check_gpus
  fi
  check_hf_token

  # Set to v1 to run v1 benchmark

@@ -386,9 +460,9 @@ main() {
  QUICK_BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/

  # benchmarking
  run_serving_tests $QUICK_BENCHMARK_ROOT/tests/"${SERVING_JSON:-serving-tests$ARCH.json}"
  run_latency_tests $QUICK_BENCHMARK_ROOT/tests/"${LATENCY_JSON:-latency-tests$ARCH.json}"
  run_throughput_tests $QUICK_BENCHMARK_ROOT/tests/"${THROUGHPUT_JSON:-throughput-tests$ARCH.json}"

  # postprocess benchmarking results
  pip install tabulate pandas
@@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import datetime
import json

@@ -34,10 +35,8 @@ serving_column_mapping = {
}

if __name__ == "__main__":
    # collect results
    for test_file in results_folder.glob("*.json"):
        with open(test_file) as f:
            raw_result = json.loads(f.read())

@@ -56,17 +55,16 @@ if __name__ == "__main__":
    serving_results = pd.DataFrame.from_dict(serving_results)

    if not serving_results.empty:
        serving_results = serving_results[list(serving_column_mapping.keys())].rename(
            columns=serving_column_mapping
        )

    serving_md_table_with_headers = tabulate(
        serving_results, headers="keys", tablefmt="pipe", showindex=False
    )
    # remove the first line of header
    serving_md_table_lines = serving_md_table_with_headers.split("\n")
    serving_md_table_without_header = "\n".join(serving_md_table_lines[2:])

    prefix = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    prefix = prefix + "_" + os.environ.get("CURRENT_LLM_SERVING_ENGINE")

@@ -76,10 +74,9 @@ if __name__ == "__main__":
        # document results with header.
        # for those who wants to reproduce our benchmark.
        f.write(serving_md_table_with_headers)
        f.write("\n")

    # document benchmarking results in json
    with open(results_folder / f"{prefix}_nightly_results.json", "w") as f:
        results = serving_results.to_dict(orient="records")
        f.write(json.dumps(results))
.buildkite/nightly-benchmarks/tests/latency-tests-cpu.json (new file, 30 lines)
@@ -0,0 +1,30 @@
[
    {
        "test_name": "latency_llama8B_tp1",
        "environment_variables": {
            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
            "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "parameters": {
            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 1,
            "load_format": "dummy",
            "num_iters_warmup": 5,
            "num_iters": 15
        }
    },
    {
        "test_name": "latency_llama8B_tp4",
        "environment_variables": {
            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
            "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "parameters": {
            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 4,
            "load_format": "dummy",
            "num_iters_warmup": 5,
            "num_iters": 15
        }
    }
]
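Each entry above is turned into a shell command by the helpers in `run-performance-benchmarks.sh`: `environment_variables` become `KEY=value` prefixes via `json2envs`, and `parameters` become command-line flags via `json2args` (the kebab-casing of flag names below is an assumption based on the flags `benchmark_latency.py` accepts; the script itself does this with jq). A small Python sketch of that translation, with an illustrative entry:

```python
# Sketch: how a test entry maps to the command line assembled by
# json2envs/json2args in run-performance-benchmarks.sh.
import json

entry = json.loads("""
{
    "test_name": "latency_llama8B_tp1",
    "environment_variables": {"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, "VLLM_CPU_KVCACHE_SPACE": 40},
    "parameters": {"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
                   "tensor_parallel_size": 1, "load_format": "dummy",
                   "num_iters_warmup": 5, "num_iters": 15}
}
""")

# environment_variables -> KEY=value prefixes
envs = " ".join(f"{k}={v}" for k, v in entry["environment_variables"].items())
# parameters -> --kebab-case flags (assumed flag naming)
args = " ".join(f"--{k.replace('_', '-')} {v}" for k, v in entry["parameters"].items())
print(f"{envs} python3 benchmark_latency.py {args}")
```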
.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json (new file, 158 lines)
@@ -0,0 +1,158 @@
[
    {
        "test_name": "serving_llama8B_tp1_sharegpt",
        "qps_list": [1, 4, 16, "inf"],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
            "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 1,
            "dtype": "bfloat16",
            "distributed_executor_backend": "mp",
            "block_size": 128,
            "trust_remote_code": "",
            "disable_log_stats": "",
            "disable_log_requests": "",
            "enforce_eager": "",
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
            "backend": "vllm",
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "max_concurrency": 60,
            "num_prompts": 200
        }
    },
    {
        "test_name": "serving_llama8B_tp2_sharegpt",
        "qps_list": [1, 4, 16, "inf"],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
            "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 2,
            "dtype": "bfloat16",
            "distributed_executor_backend": "mp",
            "block_size": 128,
            "trust_remote_code": "",
            "disable_log_stats": "",
            "disable_log_requests": "",
            "enforce_eager": "",
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
            "backend": "vllm",
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "max_concurrency": 60,
            "num_prompts": 200
        }
    },
    {
        "test_name": "serving_llama8B_tp4_sharegpt",
        "qps_list": [1, 4, 16, "inf"],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
            "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 4,
            "dtype": "bfloat16",
            "distributed_executor_backend": "mp",
            "block_size": 128,
            "trust_remote_code": "",
            "disable_log_stats": "",
            "disable_log_requests": "",
            "enforce_eager": "",
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
            "backend": "vllm",
            "dataset_name": "sharegpt",
            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "max_concurrency": 60,
            "num_prompts": 200
        }
    },
    {
        "test_name": "serving_llama8B_tp4_random_1024_128",
        "qps_list": [1, 4, 16, "inf"],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
            "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 4,
            "dtype": "bfloat16",
            "distributed_executor_backend": "mp",
            "block_size": 128,
            "trust_remote_code": "",
            "enable_chunked_prefill": "",
            "disable_log_stats": "",
            "disable_log_requests": "",
            "enforce_eager": "",
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
            "backend": "vllm",
            "dataset_name": "random",
            "random-input-len": 1024,
            "random-output-len": 128,
            "ignore-eos": "",
            "max_concurrency": 100,
            "num_prompts": 100
        }
    },
    {
        "test_name": "serving_llama8B_pp6_random_1024_128",
        "qps_list": [1, 4, 16, "inf"],
        "server_environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
            "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "server_parameters": {
            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
            "pipeline_parallel_size": 6,
            "dtype": "bfloat16",
            "distributed_executor_backend": "mp",
            "block_size": 128,
            "trust_remote_code": "",
            "enable_chunked_prefill": "",
            "disable_log_stats": "",
            "disable_log_requests": "",
            "enforce_eager": "",
            "load_format": "dummy"
        },
        "client_parameters": {
            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
            "backend": "vllm",
            "dataset_name": "random",
            "random-input-len": 1024,
            "random-output-len": 128,
            "ignore-eos": "",
            "max_concurrency": 100,
            "num_prompts": 100
        }
    }
]
@@ -0,0 +1,32 @@
[
    {
        "test_name": "throughput_llama8B_tp1",
        "environment_variables": {
            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
            "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "parameters": {
            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 1,
            "load_format": "dummy",
            "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 200,
            "backend": "vllm"
        }
    },
    {
        "test_name": "throughput_llama8B_tp4",
        "environment_variables": {
            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
            "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "parameters": {
            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 4,
            "load_format": "dummy",
            "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 200,
            "backend": "vllm"
        }
    }
]
.buildkite/pyproject.toml (new file, 46 lines)
@@ -0,0 +1,46 @@
# This local pyproject file is part of the migration from yapf to ruff format.
# It uses the same core rules as the main pyproject.toml file, but with the
# following differences:
# - ruff line length is overridden to 88
# - deprecated typing ignores (UP006, UP035) have been removed

[tool.ruff]
line-length = 88

[tool.ruff.lint.per-file-ignores]
"vllm/third_party/**" = ["ALL"]
"vllm/version.py" = ["F401"]
"vllm/_version.py" = ["ALL"]

[tool.ruff.lint]
select = [
    # pycodestyle
    "E",
    # Pyflakes
    "F",
    # pyupgrade
    "UP",
    # flake8-bugbear
    "B",
    # flake8-simplify
    "SIM",
    # isort
    "I",
    # flake8-logging-format
    "G",
]
ignore = [
    # star imports
    "F405", "F403",
    # lambda expression assignment
    "E731",
    # Loop control variable not used within loop body
    "B007",
    # f-string format
    "UP032",
    # Can remove once 3.10+ is the minimum Python version
    "UP007",
]

[tool.ruff.format]
docstring-code-format = true
@@ -1,20 +1,22 @@
steps:
  - label: "Build wheel - CUDA 12.8"
    id: build-wheel-cuda-12-8
    agents:
      queue: cpu_queue_postmerge
    commands:
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
      - "mkdir artifacts"
      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
      - "bash .buildkite/scripts/upload-wheels.sh"
    env:
      DOCKER_BUILDKIT: "1"

  - label: "Build wheel - CUDA 12.6"
    id: build-wheel-cuda-12-6
    agents:
      queue: cpu_queue_postmerge
    commands:
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.6.3 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
      - "mkdir artifacts"
      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
      - "bash .buildkite/scripts/upload-wheels.sh"

@@ -28,10 +30,11 @@ steps:

  - label: "Build wheel - CUDA 11.8"
    # depends_on: block-build-cu118-wheel
    id: build-wheel-cuda-11-8
    agents:
      queue: cpu_queue_postmerge
    commands:
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
      - "mkdir artifacts"
      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
      - "bash .buildkite/scripts/upload-wheels.sh"

@@ -44,33 +47,49 @@ steps:

  - label: "Build release image"
    depends_on: block-release-image-build
    id: build-release-image
    agents:
      queue: cpu_queue_postmerge
    commands:
      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ."
      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"

  - label: "Annotate release workflow"
    depends_on:
      - build-release-image
      - build-wheel-cuda-12-8
      - build-wheel-cuda-12-6
      - build-wheel-cuda-11-8
    id: annotate-release-workflow
    agents:
      queue: cpu_queue_postmerge
    commands:
      - "bash .buildkite/scripts/annotate-release.sh"

  - label: "Build and publish TPU release image"
    depends_on: ~
    if: build.env("NIGHTLY") == "1"
    agents:
      queue: tpu_queue_postmerge
    commands:
      - "yes | docker system prune -a"
      - "git fetch --all"
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f docker/Dockerfile.tpu ."
      - "docker push vllm/vllm-tpu:nightly"
      - "docker push vllm/vllm-tpu:$BUILDKITE_COMMIT"
|
- "docker push vllm/vllm-tpu:$BUILDKITE_COMMIT"
|
||||||
plugins:
|
plugins:
|
||||||
- docker-login#v3.0.0:
|
- docker-login#v3.0.0:
|
||||||
username: vllm
|
username: vllmbot
|
||||||
password-env: DOCKERHUB_TOKEN
|
password-env: DOCKERHUB_TOKEN
|
||||||
env:
|
env:
|
||||||
DOCKER_BUILDKIT: "1"
|
DOCKER_BUILDKIT: "1"
|
||||||
|
|
||||||
- input: "Provide Release version here"
|
- input: "Provide Release version here"
|
||||||
|
id: input-release-version
|
||||||
fields:
|
fields:
|
||||||
- text: "What is the release version?"
|
- text: "What is the release version?"
|
||||||
key: "release-version"
|
key: release-version
|
||||||
|
|
||||||
- block: "Build CPU release image"
|
- block: "Build CPU release image"
|
||||||
key: block-cpu-release-image-build
|
key: block-cpu-release-image-build
|
||||||
@ -82,7 +101,8 @@ steps:
|
|||||||
queue: cpu_queue_postmerge
|
queue: cpu_queue_postmerge
|
||||||
commands:
|
commands:
|
||||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
|
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_CPU_AVX512BF16=true --build-arg VLLM_CPU_AVX512VNNI=true --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
|
||||||
|
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest"
|
||||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
|
- "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
|
||||||
env:
|
env:
|
||||||
DOCKER_BUILDKIT: "1"
|
DOCKER_BUILDKIT: "1"
|
||||||
@ -98,6 +118,7 @@ steps:
|
|||||||
commands:
|
commands:
|
||||||
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
|
||||||
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:latest --progress plain -f docker/Dockerfile.neuron ."
|
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:latest --progress plain -f docker/Dockerfile.neuron ."
|
||||||
|
- "docker push public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:latest"
|
||||||
- "docker push public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version)"
|
- "docker push public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version)"
|
||||||
env:
|
env:
|
||||||
DOCKER_BUILDKIT: "1"
|
DOCKER_BUILDKIT: "1"
|
||||||
|
|||||||
31
.buildkite/scripts/annotate-release.sh
Executable file
31
.buildkite/scripts/annotate-release.sh
Executable file
@ -0,0 +1,31 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
set -ex
|
||||||
|
|
||||||
|
# Get release version and strip leading 'v' if present
|
||||||
|
RELEASE_VERSION=$(buildkite-agent meta-data get release-version | sed 's/^v//')
|
||||||
|
|
||||||
|
if [ -z "$RELEASE_VERSION" ]; then
|
||||||
|
echo "Error: RELEASE_VERSION is empty. 'release-version' metadata might not be set or is invalid."
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
buildkite-agent annotate --style 'info' --context 'release-workflow' << EOF
|
||||||
|
To download the wheel:
|
||||||
|
\`\`\`
|
||||||
|
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
|
||||||
|
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu126/vllm-${RELEASE_VERSION}+cu126-cp38-abi3-manylinux1_x86_64.whl .
|
||||||
|
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu118/vllm-${RELEASE_VERSION}+cu118-cp38-abi3-manylinux1_x86_64.whl .
|
||||||
|
\`\`\`
|
||||||
|
|
||||||
|
To download and upload the image:
|
||||||
|
|
||||||
|
\`\`\`
|
||||||
|
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}
|
||||||
|
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT} vllm/vllm-openai
|
||||||
|
docker tag vllm/vllm-openai vllm/vllm-openai:latest
|
||||||
|
docker tag vllm/vllm-openai vllm/vllm-openai:v${RELEASE_VERSION}
|
||||||
|
docker push vllm/vllm-openai:latest
|
||||||
|
docker push vllm/vllm-openai:v${RELEASE_VERSION}
|
||||||
|
\`\`\`
|
||||||
|
EOF
|
||||||
17
.buildkite/scripts/ci-clean-log.sh
Normal file
17
.buildkite/scripts/ci-clean-log.sh
Normal file
@ -0,0 +1,17 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
# Usage: ./ci_clean_log.sh ci.log
|
||||||
|
# This script strips timestamps and color codes from CI log files.
|
||||||
|
|
||||||
|
# Check if argument is given
|
||||||
|
if [ $# -lt 1 ]; then
|
||||||
|
echo "Usage: $0 ci.log"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
INPUT_FILE="$1"
|
||||||
|
|
||||||
|
# Strip timestamps
|
||||||
|
sed -i 's/^\[[0-9]\{4\}-[0-9]\{2\}-[0-9]\{2\}T[0-9]\{2\}:[0-9]\{2\}:[0-9]\{2\}Z\] //' "$INPUT_FILE"
|
||||||
|
|
||||||
|
# Strip colorization
|
||||||
|
sed -i -r 's/\x1B\[[0-9;]*[mK]//g' "$INPUT_FILE"
|
||||||
@ -3,6 +3,9 @@
|
|||||||
# This script runs test inside the corresponding ROCm docker container.
|
# This script runs test inside the corresponding ROCm docker container.
|
||||||
set -o pipefail
|
set -o pipefail
|
||||||
|
|
||||||
|
# Export Python path
|
||||||
|
export PYTHONPATH=".."
|
||||||
|
|
||||||
# Print ROCm version
|
# Print ROCm version
|
||||||
echo "--- Confirming Clean Initial State"
|
echo "--- Confirming Clean Initial State"
|
||||||
while true; do
|
while true; do
|
||||||
@ -74,38 +77,72 @@ HF_MOUNT="/root/.cache/huggingface"
|
|||||||
|
|
||||||
commands=$@
|
commands=$@
|
||||||
echo "Commands:$commands"
|
echo "Commands:$commands"
|
||||||
|
|
||||||
|
if [[ $commands == *"pytest -v -s basic_correctness/test_basic_correctness.py"* ]]; then
|
||||||
|
commands=${commands//"pytest -v -s basic_correctness/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s basic_correctness/test_basic_correctness.py"}
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [[ $commands == *"pytest -v -s models/test_registry.py"* ]]; then
|
||||||
|
commands=${commands//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"}
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [[ $commands == *"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"* ]]; then
|
||||||
|
commands=${commands//"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"/"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2 and not BambaForCausalLM and not Gemma2ForCausalLM and not Grok1ModelForCausalLM and not Zamba2ForCausalLM and not Gemma2Model and not GritLM'"}
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [[ $commands == *"pytest -v -s compile/test_basic_correctness.py"* ]]; then
|
||||||
|
commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s compile/test_basic_correctness.py"}
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [[ $commands == *"pytest -v -s lora"* ]]; then
|
||||||
|
commands=${commands//"pytest -v -s lora"/"VLLM_ROCM_CUSTOM_PAGED_ATTN=0 pytest -v -s lora"}
|
||||||
|
fi
|
||||||
|
|
||||||
#ignore certain kernels tests
|
#ignore certain kernels tests
|
||||||
if [[ $commands == *" kernels "* ]]; then
|
if [[ $commands == *" kernels/core"* ]]; then
|
||||||
commands="${commands} \
|
commands="${commands} \
|
||||||
--ignore=kernels/test_attention_selector.py \
|
--ignore=kernels/core/test_fused_quant_layernorm.py \
|
||||||
--ignore=kernels/test_blocksparse_attention.py \
|
--ignore=kernels/core/test_permute_cols.py"
|
||||||
--ignore=kernels/test_causal_conv1d.py \
|
fi
|
||||||
--ignore=kernels/test_cutlass.py \
|
|
||||||
--ignore=kernels/test_encoder_decoder_attn.py \
|
if [[ $commands == *" kernels/attention"* ]]; then
|
||||||
--ignore=kernels/test_flash_attn.py \
|
commands="${commands} \
|
||||||
--ignore=kernels/test_flashinfer.py \
|
--ignore=kernels/attention/test_attention_selector.py \
|
||||||
--ignore=kernels/test_int8_quant.py \
|
--ignore=kernels/attention/test_blocksparse_attention.py \
|
||||||
--ignore=kernels/test_machete_gemm.py \
|
--ignore=kernels/attention/test_encoder_decoder_attn.py \
|
||||||
--ignore=kernels/test_mamba_ssm.py \
|
--ignore=kernels/attention/test_flash_attn.py \
|
||||||
--ignore=kernels/test_marlin_gemm.py \
|
--ignore=kernels/attention/test_flashinfer.py \
|
||||||
--ignore=kernels/test_moe.py \
|
--ignore=kernels/attention/test_prefix_prefill.py \
|
||||||
--ignore=kernels/test_prefix_prefill.py \
|
--ignore=kernels/attention/test_cascade_flash_attn.py \
|
||||||
--ignore=kernels/test_rand.py \
|
--ignore=kernels/attention/test_mha_attn.py \
|
||||||
--ignore=kernels/test_sampler.py \
|
--ignore=kernels/attention/test_lightning_attn.py \
|
||||||
--ignore=kernels/test_cascade_flash_attn.py \
|
--ignore=kernels/attention/test_attention.py"
|
||||||
--ignore=kernels/test_mamba_mixer2.py \
|
fi
|
||||||
--ignore=kernels/test_aqlm.py \
|
|
||||||
--ignore=kernels/test_machete_mm.py \
|
if [[ $commands == *" kernels/quantization"* ]]; then
|
||||||
--ignore=kernels/test_mha_attn.py \
|
commands="${commands} \
|
||||||
--ignore=kernels/test_block_fp8.py \
|
--ignore=kernels/quantization/test_int8_quant.py \
|
||||||
--ignore=kernels/test_cutlass_moe.py \
|
--ignore=kernels/quantization/test_aqlm.py \
|
||||||
--ignore=kernels/test_mamba_ssm_ssd.py \
|
--ignore=kernels/quantization/test_machete_mm.py \
|
||||||
--ignore=kernels/test_attention.py \
|
--ignore=kernels/quantization/test_block_fp8.py \
|
||||||
--ignore=kernels/test_block_int8.py \
|
--ignore=kernels/quantization/test_block_int8.py \
|
||||||
--ignore=kernels/test_fused_quant_layernorm.py \
|
--ignore=kernels/quantization/test_marlin_gemm.py \
|
||||||
--ignore=kernels/test_int8_kernel.py \
|
--ignore=kernels/quantization/test_cutlass_scaled_mm.py \
|
||||||
--ignore=kernels/test_triton_moe_ptpc_fp8.py \
|
--ignore=kernels/quantization/test_int8_kernel.py"
|
||||||
--ignore=kernels/test_permute_cols.py"
|
fi
|
||||||
|
|
||||||
|
if [[ $commands == *" kernels/mamba"* ]]; then
|
||||||
|
commands="${commands} \
|
||||||
|
--ignore=kernels/mamba/test_mamba_mixer2.py \
|
||||||
|
--ignore=kernels/mamba/test_causal_conv1d.py \
|
||||||
|
--ignore=kernels/mamba/test_mamba_ssm_ssd.py"
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [[ $commands == *" kernels/moe"* ]]; then
|
||||||
|
commands="${commands} \
|
||||||
|
--ignore=kernels/moe/test_moe.py \
|
||||||
|
--ignore=kernels/moe/test_cutlass_moe.py \
|
||||||
|
--ignore=kernels/moe/test_triton_moe_ptpc_fp8.py"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
#ignore certain Entrypoints/openai tests
|
#ignore certain Entrypoints/openai tests
|
||||||
@ -147,6 +184,8 @@ fi
|
|||||||
|
|
||||||
|
|
||||||
PARALLEL_JOB_COUNT=8
|
PARALLEL_JOB_COUNT=8
|
||||||
|
MYPYTHONPATH=".."
|
||||||
|
|
||||||
# check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs.
|
# check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs.
|
||||||
if [[ $commands == *"--shard-id="* ]]; then
|
if [[ $commands == *"--shard-id="* ]]; then
|
||||||
# assign job count as the number of shards used
|
# assign job count as the number of shards used
|
||||||
@ -167,6 +206,7 @@ if [[ $commands == *"--shard-id="* ]]; then
|
|||||||
-e AWS_SECRET_ACCESS_KEY \
|
-e AWS_SECRET_ACCESS_KEY \
|
||||||
-v "${HF_CACHE}:${HF_MOUNT}" \
|
-v "${HF_CACHE}:${HF_MOUNT}" \
|
||||||
-e "HF_HOME=${HF_MOUNT}" \
|
-e "HF_HOME=${HF_MOUNT}" \
|
||||||
|
-e "PYTHONPATH=${MYPYTHONPATH}" \
|
||||||
--name "${container_name}_${GPU}" \
|
--name "${container_name}_${GPU}" \
|
||||||
"${image_name}" \
|
"${image_name}" \
|
||||||
/bin/bash -c "${commands_gpu}" \
|
/bin/bash -c "${commands_gpu}" \
|
||||||
@ -197,6 +237,7 @@ else
|
|||||||
-e AWS_SECRET_ACCESS_KEY \
|
-e AWS_SECRET_ACCESS_KEY \
|
||||||
-v "${HF_CACHE}:${HF_MOUNT}" \
|
-v "${HF_CACHE}:${HF_MOUNT}" \
|
||||||
-e "HF_HOME=${HF_MOUNT}" \
|
-e "HF_HOME=${HF_MOUNT}" \
|
||||||
|
-e "PYTHONPATH=${MYPYTHONPATH}" \
|
||||||
--name "${container_name}" \
|
--name "${container_name}" \
|
||||||
"${image_name}" \
|
"${image_name}" \
|
||||||
/bin/bash -c "${commands}"
|
/bin/bash -c "${commands}"
|
||||||
|
|||||||
@ -5,7 +5,13 @@
|
|||||||
set -ex
|
set -ex
|
||||||
|
|
||||||
# Setup cleanup
|
# Setup cleanup
|
||||||
remove_docker_container() { podman rm -f cpu-test-ubi9-ppc || true; podman system prune -f; }
|
remove_docker_container() {
|
||||||
|
if [[ -n "$container_id" ]]; then
|
||||||
|
podman stop --all -t0
|
||||||
|
podman rm -f "$container_id" || true
|
||||||
|
fi
|
||||||
|
podman system prune -f
|
||||||
|
}
|
||||||
trap remove_docker_container EXIT
|
trap remove_docker_container EXIT
|
||||||
remove_docker_container
|
remove_docker_container
|
||||||
|
|
||||||
@ -13,26 +19,31 @@ remove_docker_container
|
|||||||
podman build -t cpu-test-ubi9-ppc -f docker/Dockerfile.ppc64le .
|
podman build -t cpu-test-ubi9-ppc -f docker/Dockerfile.ppc64le .
|
||||||
|
|
||||||
# Run the image
|
# Run the image
|
||||||
podman run -itd --entrypoint /bin/bash -v /tmp/:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN --name cpu-test-ubi9-ppc cpu-test-ubi9-ppc
|
container_id=$(podman run -itd --entrypoint /bin/bash -v /tmp/:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN cpu-test-ubi9-ppc)
|
||||||
|
|
||||||
function cpu_tests() {
|
function cpu_tests() {
|
||||||
|
|
||||||
# offline inference
|
# offline inference
|
||||||
podman exec cpu-test-ubi9-ppc bash -c "
|
podman exec -it "$container_id" bash -c "
|
||||||
set -e
|
set -e
|
||||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
|
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
|
||||||
|
|
||||||
# Run basic model test
|
# Run basic model test
|
||||||
podman exec cpu-test-ubi9-ppc bash -c "
|
podman exec -it "$container_id" bash -c "
|
||||||
set -e
|
set -e
|
||||||
pip install pytest pytest-asyncio einops peft Pillow soundfile transformers_stream_generator matplotlib
|
pip install pytest pytest-asyncio einops peft Pillow soundfile transformers_stream_generator matplotlib
|
||||||
pip install sentence-transformers datamodel_code_generator
|
pip install sentence-transformers datamodel_code_generator
|
||||||
pytest -v -s tests/models/embedding/language/test_cls_models.py::test_classification_models[float-jason9693/Qwen2.5-1.5B-apeach]
|
pytest -v -s tests/models/language/generation/test_bart.py -m cpu_model
|
||||||
pytest -v -s tests/models/embedding/language/test_embedding.py::test_models[half-BAAI/bge-base-en-v1.5]
|
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-openai-community/gpt2]
|
||||||
pytest -v -s tests/models/encoder_decoder/language -m cpu_model"
|
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-facebook/opt-125m]
|
||||||
|
pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-google/gemma-1.1-2b-it]
|
||||||
|
pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach]
|
||||||
|
pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model"
|
||||||
}
|
}
|
||||||
|
|
||||||
# All of CPU tests are expected to be finished less than 40 mins.
|
# All of CPU tests are expected to be finished less than 40 mins.
|
||||||
|
|
||||||
|
export container_id
|
||||||
export -f cpu_tests
|
export -f cpu_tests
|
||||||
timeout 40m bash -c cpu_tests
|
timeout 40m bash -c cpu_tests
|
||||||
|
|
||||||
|
|||||||
@ -6,75 +6,83 @@ set -ex
|
|||||||
|
|
||||||
# allow to bind to different cores
|
# allow to bind to different cores
|
||||||
CORE_RANGE=${CORE_RANGE:-48-95}
|
CORE_RANGE=${CORE_RANGE:-48-95}
|
||||||
|
OMP_CORE_RANGE=${OMP_CORE_RANGE:-48-95}
|
||||||
NUMA_NODE=${NUMA_NODE:-1}
|
NUMA_NODE=${NUMA_NODE:-1}
|
||||||
|
|
||||||
|
export CMAKE_BUILD_PARALLEL_LEVEL=32
|
||||||
|
|
||||||
# Setup cleanup
|
# Setup cleanup
|
||||||
remove_docker_container() {
|
remove_docker_container() {
|
||||||
set -e;
|
set -e;
|
||||||
docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true;
|
docker rm -f cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"-avx2 || true;
|
||||||
docker image rm cpu-test-"$BUILDKITE_BUILD_NUMBER" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 || true;
|
|
||||||
}
|
}
|
||||||
trap remove_docker_container EXIT
|
trap remove_docker_container EXIT
|
||||||
remove_docker_container
|
remove_docker_container
|
||||||
|
|
||||||
# Try building the docker image
|
# Try building the docker image
|
||||||
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$BUILDKITE_BUILD_NUMBER" --target vllm-test -f docker/Dockerfile.cpu .
|
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu .
|
||||||
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
|
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
|
||||||
|
|
||||||
# Run the image, setting --shm-size=4g for tensor parallel.
|
# Run the image, setting --shm-size=4g for tensor parallel.
|
||||||
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
|
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --env VLLM_CPU_CI_ENV=1 --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
|
||||||
--cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"
|
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --env VLLM_CPU_CI_ENV=1 --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2
|
||||||
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
|
|
||||||
--cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2
|
|
||||||
|
|
||||||
function cpu_tests() {
|
function cpu_tests() {
|
||||||
set -e
|
set -e
|
||||||
export NUMA_NODE=$2
|
export NUMA_NODE=$2
|
||||||
export BUILDKITE_BUILD_NUMBER=$3
|
|
||||||
|
# list packages
|
||||||
|
docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c "
|
||||||
|
set -e
|
||||||
|
pip list"
|
||||||
|
|
||||||
|
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
||||||
|
set -e
|
||||||
|
pip list"
|
||||||
|
|
||||||
# offline inference
|
# offline inference
|
||||||
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" bash -c "
|
docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c "
|
||||||
set -e
|
set -e
|
||||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
|
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
|
||||||
|
|
||||||
# Run basic model test
|
# Run basic model test
|
||||||
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
|
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
||||||
set -e
|
set -e
|
||||||
pytest -v -s tests/kernels/test_cache.py -m cpu_model
|
# Note: disable until supports V1
|
||||||
pytest -v -s tests/kernels/test_mla_decode_cpu.py -m cpu_model
|
# pytest -v -s tests/kernels/attention/test_cache.py -m cpu_model
|
||||||
pytest -v -s tests/models/decoder_only/language -m cpu_model
|
# pytest -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
|
||||||
pytest -v -s tests/models/embedding/language -m cpu_model
|
|
||||||
pytest -v -s tests/models/encoder_decoder/language -m cpu_model
|
# Note: disable Bart until supports V1
|
||||||
pytest -v -s tests/models/decoder_only/audio_language -m cpu_model
|
pytest -v -s tests/models/language/generation -m cpu_model \
|
||||||
pytest -v -s tests/models/decoder_only/vision_language -m cpu_model"
|
--ignore=tests/models/language/generation/test_bart.py
|
||||||
|
VLLM_CPU_SGL_KERNEL=1 pytest -v -s tests/models/language/generation -m cpu_model \
|
||||||
|
--ignore=tests/models/language/generation/test_bart.py
|
||||||
|
|
||||||
|
pytest -v -s tests/models/language/pooling -m cpu_model
|
||||||
|
pytest -v -s tests/models/multimodal/generation \
|
||||||
|
--ignore=tests/models/multimodal/generation/test_mllama.py \
|
||||||
|
--ignore=tests/models/multimodal/generation/test_pixtral.py \
|
||||||
|
-m cpu_model"
|
||||||
|
|
||||||
# Run compressed-tensor test
|
# Run compressed-tensor test
|
||||||
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
|
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
||||||
set -e
|
set -e
|
||||||
pytest -s -v \
|
pytest -s -v \
|
||||||
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
|
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs[False-10-32-neuralmagic/Llama-3.2-1B-quantized.w8a8]"
|
||||||
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token"
|
|
||||||
|
|
||||||
|
# Note: disable it until supports V1
|
||||||
# Run AWQ test
|
# Run AWQ test
|
||||||
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
|
# docker exec cpu-test-"$NUMA_NODE" bash -c "
|
||||||
set -e
|
# set -e
|
||||||
pytest -s -v \
|
# VLLM_USE_V1=0 pytest -s -v \
|
||||||
tests/quantization/test_ipex_quant.py"
|
# tests/quantization/test_ipex_quant.py"
|
||||||
|
|
||||||
# Run chunked-prefill and prefix-cache test
|
|
||||||
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
|
|
||||||
set -e
|
|
||||||
pytest -s -v -k cpu_model \
|
|
||||||
tests/basic_correctness/test_chunked_prefill.py"
|
|
||||||
|
|
||||||
# online serving
|
# online serving
|
||||||
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
|
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
||||||
set -e
|
set -e
|
||||||
export VLLM_CPU_KVCACHE_SPACE=10
|
|
||||||
export VLLM_CPU_OMP_THREADS_BIND=$1
|
|
||||||
python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half &
|
python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half &
|
||||||
timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
|
timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
|
||||||
python3 benchmarks/benchmark_serving.py \
|
VLLM_CPU_CI_ENV=0 python3 benchmarks/benchmark_serving.py \
|
||||||
--backend vllm \
|
--backend vllm \
|
||||||
--dataset-name random \
|
--dataset-name random \
|
||||||
--model facebook/opt-125m \
|
--model facebook/opt-125m \
|
||||||
@ -83,7 +91,7 @@ function cpu_tests() {
|
|||||||
--tokenizer facebook/opt-125m"
|
--tokenizer facebook/opt-125m"
|
||||||
|
|
||||||
# Run multi-lora tests
|
# Run multi-lora tests
|
||||||
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
|
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
||||||
set -e
|
set -e
|
||||||
pytest -s -v \
|
pytest -s -v \
|
||||||
tests/lora/test_qwen2vl.py"
|
tests/lora/test_qwen2vl.py"
|
||||||
@ -91,4 +99,4 @@ function cpu_tests() {
|
|||||||
|
|
||||||
# All of CPU tests are expected to be finished less than 40 mins.
|
# All of CPU tests are expected to be finished less than 40 mins.
|
||||||
export -f cpu_tests
|
export -f cpu_tests
|
||||||
timeout 40m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE $BUILDKITE_BUILD_NUMBER"
|
timeout 1.5h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
|
||||||
|
|||||||
@ -2,23 +2,57 @@
|
|||||||
|
|
||||||
# This script build the CPU docker image and run the offline inference inside the container.
|
# This script build the CPU docker image and run the offline inference inside the container.
|
||||||
# It serves a sanity check for compilation and basic model usage.
|
# It serves a sanity check for compilation and basic model usage.
|
||||||
set -ex
|
set -exuo pipefail
|
||||||
|
|
||||||
# Try building the docker image
|
# Try building the docker image
|
||||||
docker build -t hpu-test-env -f docker/Dockerfile.hpu .
|
cat <<EOF | docker build -t hpu-plugin-v1-test-env -f - .
|
||||||
|
FROM 1.22-413-pt2.7.1:latest
|
||||||
|
|
||||||
|
COPY ./ /workspace/vllm
|
||||||
|
|
||||||
|
WORKDIR /workspace/vllm
|
||||||
|
|
||||||
|
RUN pip install -v -r requirements/hpu.txt
|
||||||
|
RUN pip install git+https://github.com/vllm-project/vllm-gaudi.git
|
||||||
|
|
||||||
|
ENV no_proxy=localhost,127.0.0.1
|
||||||
|
ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true
|
||||||
|
|
||||||
|
RUN VLLM_TARGET_DEVICE=hpu python3 setup.py install
|
||||||
|
|
||||||
|
# install development dependencies (for testing)
|
||||||
|
RUN python3 -m pip install -e tests/vllm_test_utils
|
||||||
|
|
||||||
|
WORKDIR /workspace/
|
||||||
|
|
||||||
|
RUN git clone https://github.com/vllm-project/vllm-gaudi.git
|
||||||
|
|
||||||
|
RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
|
||||||
|
|
||||||
|
EOF
|
||||||
|
|
||||||
# Setup cleanup
|
# Setup cleanup
|
||||||
# certain versions of HPU software stack have a bug that can
|
# certain versions of HPU software stack have a bug that can
|
||||||
# override the exit code of the script, so we need to use
|
# override the exit code of the script, so we need to use
|
||||||
# separate remove_docker_container and remove_docker_container_and_exit
|
# separate remove_docker_containers and remove_docker_containers_and_exit
|
||||||
# functions, while other platforms only need one remove_docker_container
|
# functions, while other platforms only need one remove_docker_container
|
||||||
# function.
|
# function.
|
||||||
EXITCODE=1
|
EXITCODE=1
|
||||||
remove_docker_container() { docker rm -f hpu-test || true; }
|
remove_docker_containers() { docker rm -f hpu-plugin-v1-test || true; }
|
||||||
remove_docker_container_and_exit() { remove_docker_container; exit $EXITCODE; }
|
trap 'remove_docker_containers; exit $EXITCODE;' EXIT
|
||||||
trap remove_docker_container_and_exit EXIT
|
remove_docker_containers
|
||||||
remove_docker_container
|
|
||||||
|
echo "Running HPU plugin v1 test"
|
||||||
|
docker run --rm --runtime=habana --name=hpu-plugin-v1-test --network=host \
|
||||||
|
-e HABANA_VISIBLE_DEVICES=all \
|
||||||
|
hpu-plugin-v1-test-env \
|
||||||
|
/bin/bash "/workspace/vllm-gaudi/tests/upstream_tests/ci_tests.sh"
|
||||||
|
|
||||||
# Run the image and launch offline inference
|
|
||||||
docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m
|
|
||||||
EXITCODE=$?
|
EXITCODE=$?
|
||||||
|
if [ $EXITCODE -eq 0 ]; then
|
||||||
|
echo "Test with basic model passed"
|
||||||
|
else
|
||||||
|
echo "Test with basic model FAILED with exit code: $EXITCODE" >&2
|
||||||
|
fi
|
||||||
|
|
||||||
|
# The trap will handle the container removal and final exit.
|
||||||
@ -11,13 +11,14 @@ container_name="neuron_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
|
|||||||
HF_CACHE="$(realpath ~)/huggingface"
|
HF_CACHE="$(realpath ~)/huggingface"
|
||||||
mkdir -p "${HF_CACHE}"
|
mkdir -p "${HF_CACHE}"
|
||||||
HF_MOUNT="/root/.cache/huggingface"
|
HF_MOUNT="/root/.cache/huggingface"
|
||||||
|
HF_TOKEN=$(aws secretsmanager get-secret-value --secret-id "ci/vllm-neuron/hf-token" --region us-west-2 --query 'SecretString' --output text | jq -r .VLLM_NEURON_CI_HF_TOKEN)
|
||||||
|
|
||||||
NEURON_COMPILE_CACHE_URL="$(realpath ~)/neuron_compile_cache"
|
NEURON_COMPILE_CACHE_URL="$(realpath ~)/neuron_compile_cache"
|
||||||
mkdir -p "${NEURON_COMPILE_CACHE_URL}"
|
mkdir -p "${NEURON_COMPILE_CACHE_URL}"
|
||||||
NEURON_COMPILE_CACHE_MOUNT="/root/.cache/neuron_compile_cache"
|
NEURON_COMPILE_CACHE_MOUNT="/root/.cache/neuron_compile_cache"
|
||||||
|
|
||||||
# Try building the docker image
|
# Try building the docker image
|
||||||
aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-west-2.amazonaws.com
|
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws
|
||||||
|
|
||||||
# prune old image and containers to save disk space, and only once a day
|
# prune old image and containers to save disk space, and only once a day
|
||||||
# by using a timestamp file in tmp.
|
# by using a timestamp file in tmp.
|
||||||
@ -47,8 +48,17 @@ trap remove_docker_container EXIT
|
|||||||
docker run --rm -it --device=/dev/neuron0 --network bridge \
|
docker run --rm -it --device=/dev/neuron0 --network bridge \
|
||||||
-v "${HF_CACHE}:${HF_MOUNT}" \
|
-v "${HF_CACHE}:${HF_MOUNT}" \
|
||||||
-e "HF_HOME=${HF_MOUNT}" \
|
-e "HF_HOME=${HF_MOUNT}" \
|
||||||
|
-e "HF_TOKEN=${HF_TOKEN}" \
|
||||||
-v "${NEURON_COMPILE_CACHE_URL}:${NEURON_COMPILE_CACHE_MOUNT}" \
|
-v "${NEURON_COMPILE_CACHE_URL}:${NEURON_COMPILE_CACHE_MOUNT}" \
|
||||||
-e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \
|
-e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \
|
||||||
--name "${container_name}" \
|
--name "${container_name}" \
|
||||||
${image_name} \
|
${image_name} \
|
||||||
/bin/bash -c "python3 /workspace/vllm/examples/offline_inference/neuron.py && python3 -m pytest /workspace/vllm/tests/neuron/1_core/ -v --capture=tee-sys && python3 -m pytest /workspace/vllm/tests/neuron/2_core/ -v --capture=tee-sys"
|
/bin/bash -c "
|
||||||
|
set -e; # Exit on first error
|
||||||
|
python3 /workspace/vllm/examples/offline_inference/neuron.py;
|
||||||
|
python3 -m pytest /workspace/vllm/tests/neuron/1_core/ -v --capture=tee-sys;
|
||||||
|
for f in /workspace/vllm/tests/neuron/2_core/*.py; do
|
||||||
|
echo \"Running test file: \$f\";
|
||||||
|
python3 -m pytest \$f -v --capture=tee-sys;
|
||||||
|
done
|
||||||
|
"
|
||||||
@ -1,53 +1,187 @@
|
|||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
|
|
||||||
set -xue
|
set -xu
|
||||||
|
|
||||||
|
|
||||||
|
remove_docker_container() {
|
||||||
|
docker rm -f tpu-test || true;
|
||||||
|
docker rm -f vllm-tpu || true;
|
||||||
|
}
|
||||||
|
|
||||||
|
trap remove_docker_container EXIT
|
||||||
|
|
||||||
|
# Remove the container that might not be cleaned up in the previous run.
|
||||||
|
remove_docker_container
|
||||||
|
|
||||||
# Build the docker image.
|
# Build the docker image.
|
||||||
docker build -f docker/Dockerfile.tpu -t vllm-tpu .
|
docker build -f docker/Dockerfile.tpu -t vllm-tpu .
|
||||||
|
|
||||||
# Set up cleanup.
|
# Set up cleanup.
|
||||||
remove_docker_container() { docker rm -f tpu-test || true; }
|
cleanup_docker() {
|
||||||
trap remove_docker_container EXIT
|
# Get Docker's root directory
|
||||||
# Remove the container that might not be cleaned up in the previous run.
|
docker_root=$(docker info -f '{{.DockerRootDir}}')
|
||||||
remove_docker_container
|
if [ -z "$docker_root" ]; then
|
||||||
|
echo "Failed to determine Docker root directory."
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
echo "Docker root directory: $docker_root"
|
||||||
|
# Check disk usage of the filesystem where Docker's root directory is located
|
||||||
|
disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
|
||||||
|
# Define the threshold
|
||||||
|
threshold=70
|
||||||
|
if [ "$disk_usage" -gt "$threshold" ]; then
|
||||||
|
echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
|
||||||
|
# Remove dangling images (those that are not tagged and not used by any container)
|
||||||
|
docker image prune -f
|
||||||
|
# Remove unused volumes / force the system prune for old images as well.
|
||||||
|
docker volume prune -f && docker system prune --force --filter "until=72h" --all
|
||||||
|
echo "Docker images and volumes cleanup completed."
|
||||||
|
else
|
||||||
|
echo "Disk usage is below $threshold%. No cleanup needed."
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
cleanup_docker
|
||||||
|
|
||||||
# For HF_TOKEN.
|
# For HF_TOKEN.
|
||||||
source /etc/environment
|
source /etc/environment
|
||||||
# Run a simple end-to-end example.
|
|
||||||
docker run --privileged --net host --shm-size=16G -it \
|
docker run --privileged --net host --shm-size=16G -it \
|
||||||
-e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
|
-e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
|
||||||
vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \
|
vllm-tpu /bin/bash -c '
|
||||||
&& python3 -m pip install pytest pytest-asyncio tpu-info \
|
set -e # Exit immediately if a command exits with a non-zero status.
|
||||||
&& python3 -m pip install lm_eval[api]==0.4.4 \
|
set -u # Treat unset variables as an error.
|
||||||
&& export VLLM_USE_V1=1 \
|
|
||||||
&& export VLLM_XLA_CHECK_RECOMPILATION=1 \
|
|
||||||
&& echo HARDWARE \
|
|
||||||
&& tpu-info \
|
|
||||||
&& echo TEST_0 \
|
|
||||||
&& pytest -v -s /workspace/vllm/tests/v1/tpu/test_perf.py \
|
|
||||||
&& echo TEST_1 \
|
|
||||||
&& pytest -v -s /workspace/vllm/tests/tpu/test_compilation.py \
|
|
||||||
&& echo TEST_2 \
|
|
||||||
&& pytest -v -s /workspace/vllm/tests/v1/tpu/test_basic.py \
|
|
||||||
&& echo TEST_3 \
|
|
||||||
&& pytest -v -s /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine \
|
|
||||||
&& echo TEST_4 \
|
|
||||||
&& pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py \
|
|
||||||
&& echo TEST_5 \
|
|
||||||
&& python3 /workspace/vllm/examples/offline_inference/tpu.py \
|
|
||||||
&& echo TEST_6 \
|
|
||||||
&& pytest -s -v /workspace/vllm/tests/v1/tpu/worker/test_tpu_model_runner.py \
|
|
||||||
&& echo TEST_7 \
|
|
||||||
&& pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py \
|
|
||||||
&& echo TEST_8 \
|
|
||||||
&& pytest -s -v /workspace/vllm/tests/v1/tpu/test_topk_topp_sampler.py \
|
|
||||||
&& echo TEST_9 \
|
|
||||||
&& pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py \
|
|
||||||
&& echo TEST_10 \
|
|
||||||
&& pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py \
|
|
||||||
&& echo TEST_11 \
|
|
||||||
&& pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py" \
|
|
||||||
|
|
||||||
|
echo "--- Starting script inside Docker container ---"
|
||||||
|
|
||||||
|
# Create results directory
|
||||||
|
RESULTS_DIR=$(mktemp -d)
|
||||||
|
# If mktemp fails, set -e will cause the script to exit.
|
||||||
|
echo "Results will be stored in: $RESULTS_DIR"
|
||||||
|
|
||||||
|
# Install dependencies
|
||||||
|
echo "--- Installing Python dependencies ---"
|
||||||
|
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
|
||||||
|
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
|
||||||
|
&& python3 -m pip install --progress-bar off lm_eval[api]==0.4.4
|
||||||
|
echo "--- Python dependencies installed ---"
|
||||||
|
export VLLM_USE_V1=1
|
||||||
|
export VLLM_XLA_CHECK_RECOMPILATION=1
|
||||||
|
export VLLM_XLA_CACHE_PATH=
|
||||||
|
echo "Using VLLM V1"
|
||||||
|
|
||||||
|
echo "--- Hardware Information ---"
|
||||||
|
tpu-info
|
||||||
|
echo "--- Starting Tests ---"
|
||||||
|
set +e
|
||||||
|
overall_script_exit_code=0
|
||||||
|
|
||||||
|
# --- Test Definitions ---
|
||||||
|
# If a test fails, this function will print logs and will not cause the main script to exit.
|
||||||
|
run_test() {
|
||||||
|
local test_num=$1
|
||||||
|
local test_name=$2
|
||||||
|
local test_command=$3
|
||||||
|
local log_file="$RESULTS_DIR/test_${test_num}.log"
|
||||||
|
local actual_exit_code
|
||||||
|
|
||||||
|
echo "--- TEST_$test_num: Running $test_name ---"
|
||||||
|
|
||||||
|
# Execute the test command.
|
||||||
|
eval "$test_command" > >(tee -a "$log_file") 2> >(tee -a "$log_file" >&2)
|
||||||
|
actual_exit_code=$?
|
||||||
|
|
||||||
|
echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" # This goes to main log
|
||||||
|
echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" >> "$log_file" # Also to per-test log
|
||||||
|
|
||||||
|
if [ "$actual_exit_code" -ne 0 ]; then
|
||||||
|
echo "TEST_$test_num ($test_name) FAILED with exit code $actual_exit_code." >&2
|
||||||
|
echo "--- Log for failed TEST_$test_num ($test_name) ---" >&2
|
||||||
|
if [ -f "$log_file" ]; then
|
||||||
|
cat "$log_file" >&2
|
||||||
|
else
|
||||||
|
echo "Log file $log_file not found for TEST_$test_num ($test_name)." >&2
|
||||||
|
fi
|
||||||
|
echo "--- End of log for TEST_$test_num ($test_name) ---" >&2
|
||||||
|
return "$actual_exit_code" # Return the failure code
|
||||||
|
else
|
||||||
|
echo "TEST_$test_num ($test_name) PASSED."
|
||||||
|
return 0 # Return success
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
|
# Helper function to call run_test and update the overall script exit code
|
||||||
|
run_and_track_test() {
|
||||||
|
local test_num_arg="$1"
|
||||||
|
local test_name_arg="$2"
|
||||||
|
local test_command_arg="$3"
|
||||||
|
|
||||||
|
# Run the test
|
||||||
|
run_test "$test_num_arg" "$test_name_arg" "$test_command_arg"
|
||||||
|
local test_specific_exit_code=$?
|
||||||
|
|
||||||
|
# If the test failed, set the overall script exit code to 1
|
||||||
|
if [ "$test_specific_exit_code" -ne 0 ]; then
|
||||||
|
# No need for extra echo here, run_test already logged the failure.
|
||||||
|
overall_script_exit_code=1
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
|
# --- Actual Test Execution ---
|
||||||
|
run_and_track_test 0 "test_perf.py" \
|
||||||
|
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_perf.py"
|
||||||
|
run_and_track_test 1 "test_compilation.py" \
|
||||||
|
"python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_compilation.py"
|
||||||
|
run_and_track_test 2 "test_basic.py" \
|
||||||
|
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_basic.py"
|
||||||
|
run_and_track_test 3 "test_accuracy.py::test_lm_eval_accuracy_v1_engine" \
|
||||||
|
"python3 -m pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine"
|
||||||
|
run_and_track_test 4 "test_quantization_accuracy.py" \
|
||||||
|
"python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py"
|
||||||
|
run_and_track_test 5 "examples/offline_inference/tpu.py" \
|
||||||
|
"python3 /workspace/vllm/examples/offline_inference/tpu.py"
|
||||||
|
run_and_track_test 6 "test_tpu_model_runner.py" \
|
||||||
|
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/worker/test_tpu_model_runner.py"
|
||||||
|
run_and_track_test 7 "test_sampler.py" \
|
||||||
|
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py"
|
||||||
|
run_and_track_test 8 "test_topk_topp_sampler.py" \
|
||||||
|
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_topk_topp_sampler.py"
|
||||||
|
run_and_track_test 9 "test_multimodal.py" \
|
||||||
|
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py"
|
||||||
|
run_and_track_test 10 "test_pallas.py" \
|
||||||
|
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py"
|
||||||
|
run_and_track_test 11 "test_struct_output_generate.py" \
|
||||||
|
"python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\""
|
||||||
|
run_and_track_test 12 "test_moe_pallas.py" \
|
||||||
|
"python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py"
|
||||||
|
run_and_track_test 13 "test_lora.py" \
|
||||||
|
"VLLM_XLA_CHECK_RECOMPILATION=0 python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/test_lora.py"
|
||||||
|
run_and_track_test 14 "test_tpu_qkv_linear.py" \
|
||||||
|
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_tpu_qkv_linear.py"
|
||||||
|
run_and_track_test 15 "test_spmd_model_weight_loading.py" \
|
||||||
|
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_spmd_model_weight_loading.py"
|
||||||
|
run_and_track_test 16 "test_kv_cache_update_kernel.py" \
|
||||||
|
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_kv_cache_update_kernel.py"
|
||||||
|
|
||||||
|
# After all tests have been attempted, exit with the overall status.
|
||||||
|
if [ "$overall_script_exit_code" -ne 0 ]; then
|
||||||
|
echo "--- One or more tests FAILED. Overall script exiting with failure code 1. ---"
|
||||||
|
else
|
||||||
|
echo "--- All tests have completed and PASSED. Overall script exiting with success code 0. ---"
|
||||||
|
fi
|
||||||
|
exit "$overall_script_exit_code"
|
||||||
|
' # IMPORTANT: This is the closing single quote for the bash -c "..." command. Ensure it is present and correct.
|
||||||
|
|
||||||
|
# Capture the exit code of the docker run command
|
||||||
|
DOCKER_RUN_EXIT_CODE=$?
|
||||||
|
|
||||||
|
# The trap will run for cleanup.
|
||||||
|
# Exit the main script with the Docker run command's exit code.
|
||||||
|
if [ "$DOCKER_RUN_EXIT_CODE" -ne 0 ]; then
|
||||||
|
echo "Docker run command failed with exit code $DOCKER_RUN_EXIT_CODE."
|
||||||
|
exit "$DOCKER_RUN_EXIT_CODE"
|
||||||
|
else
|
||||||
|
echo "Docker run command completed successfully."
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
# TODO: This test fails because it uses RANDOM_SEED sampling
|
# TODO: This test fails because it uses RANDOM_SEED sampling
|
||||||
# && VLLM_USE_V1=1 pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
|
# pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
|
||||||
|
|||||||
@ -11,8 +11,8 @@ container_name="xpu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head
|
|||||||
docker build -t ${image_name} -f docker/Dockerfile.xpu .
|
docker build -t ${image_name} -f docker/Dockerfile.xpu .
|
||||||
|
|
||||||
# Setup cleanup
|
# Setup cleanup
|
||||||
remove_docker_container() {
|
remove_docker_container() {
|
||||||
docker rm -f "${container_name}" || true;
|
docker rm -f "${container_name}" || true;
|
||||||
docker image rm -f "${image_name}" || true;
|
docker image rm -f "${image_name}" || true;
|
||||||
docker system prune -f || true;
|
docker system prune -f || true;
|
||||||
}
|
}
|
||||||
@ -26,6 +26,9 @@ docker run \
|
|||||||
--name "${container_name}" \
|
--name "${container_name}" \
|
||||||
"${image_name}" \
|
"${image_name}" \
|
||||||
sh -c '
|
sh -c '
|
||||||
VLLM_USE_V1=0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m
|
VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
|
||||||
VLLM_USE_V1=0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m -tp 2
|
VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
|
||||||
|
VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
|
||||||
|
cd tests
|
||||||
|
pytest -v -s v1/core
|
||||||
'
|
'
|
||||||
|
|||||||
18
.buildkite/scripts/rerun-test.sh
Normal file
18
.buildkite/scripts/rerun-test.sh
Normal file
@ -0,0 +1,18 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
# Usage: ./rerun_test.sh path/to/test.py::test_name
|
||||||
|
|
||||||
|
# Check if argument is given
|
||||||
|
if [ $# -lt 1 ]; then
|
||||||
|
echo "Usage: $0 path/to/test.py::test_name"
|
||||||
|
echo "Example: $0 tests/v1/engine/test_engine_core_client.py::test_kv_cache_events[True-tcp]"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
TEST=$1
|
||||||
|
COUNT=1
|
||||||
|
|
||||||
|
while pytest -sv "$TEST"; do
|
||||||
|
COUNT=$((COUNT + 1))
|
||||||
|
echo "RUN NUMBER ${COUNT}"
|
||||||
|
done
|
||||||
24
.buildkite/scripts/tpu/cleanup_docker.sh
Executable file
24
.buildkite/scripts/tpu/cleanup_docker.sh
Executable file
@ -0,0 +1,24 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
set -euo pipefail
|
||||||
|
|
||||||
|
docker_root=$(docker info -f '{{.DockerRootDir}}')
|
||||||
|
if [ -z "$docker_root" ]; then
|
||||||
|
echo "Failed to determine Docker root directory."
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
echo "Docker root directory: $docker_root"
|
||||||
|
# Check disk usage of the filesystem where Docker's root directory is located
|
||||||
|
disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
|
||||||
|
# Define the threshold
|
||||||
|
threshold=70
|
||||||
|
if [ "$disk_usage" -gt "$threshold" ]; then
|
||||||
|
echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
|
||||||
|
# Remove dangling images (those that are not tagged and not used by any container)
|
||||||
|
docker image prune -f
|
||||||
|
# Remove unused volumes / force the system prune for old images as well.
|
||||||
|
docker volume prune -f && docker system prune --force --filter "until=72h" --all
|
||||||
|
echo "Docker images and volumes cleanup completed."
|
||||||
|
else
|
||||||
|
echo "Disk usage is below $threshold%. No cleanup needed."
|
||||||
|
fi
|
||||||
14
.buildkite/scripts/tpu/config_v6e_1.env
Normal file
14
.buildkite/scripts/tpu/config_v6e_1.env
Normal file
@ -0,0 +1,14 @@
|
|||||||
|
# Environment config
|
||||||
|
TEST_NAME=llama8b
|
||||||
|
CONTAINER_NAME=vllm-tpu
|
||||||
|
|
||||||
|
# vllm config
|
||||||
|
MODEL=meta-llama/Llama-3.1-8B-Instruct
|
||||||
|
MAX_NUM_SEQS=256
|
||||||
|
MAX_NUM_BATCHED_TOKENS=1024
|
||||||
|
TENSOR_PARALLEL_SIZE=1
|
||||||
|
MAX_MODEL_LEN=2048
|
||||||
|
DOWNLOAD_DIR=/mnt/disks/persist
|
||||||
|
EXPECTED_THROUGHPUT=8.0
|
||||||
|
INPUT_LEN=1800
|
||||||
|
OUTPUT_LEN=128
|
||||||
102
.buildkite/scripts/tpu/docker_run_bm.sh
Executable file
102
.buildkite/scripts/tpu/docker_run_bm.sh
Executable file
@ -0,0 +1,102 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
if [ ! -f "$1" ]; then
|
||||||
|
echo "Error: The env file '$1' does not exist."
|
||||||
|
exit 1 # Exit the script with a non-zero status to indicate an error
|
||||||
|
fi
|
||||||
|
|
||||||
|
ENV_FILE=$1
|
||||||
|
|
||||||
|
# For testing on local vm, use `set -a` to export all variables
|
||||||
|
source /etc/environment
|
||||||
|
source $ENV_FILE
|
||||||
|
|
||||||
|
remove_docker_container() {
|
||||||
|
docker rm -f tpu-test || true;
|
||||||
|
docker rm -f vllm-tpu || true;
|
||||||
|
docker rm -f $CONTAINER_NAME || true;
|
||||||
|
}
|
||||||
|
|
||||||
|
trap remove_docker_container EXIT
|
||||||
|
|
||||||
|
# Remove the container that might not be cleaned up in the previous run.
|
||||||
|
remove_docker_container
|
||||||
|
|
||||||
|
# Build docker image.
|
||||||
|
# TODO: build the image outside the script and share the image with other
|
||||||
|
# tpu test if building time is too long.
|
||||||
|
DOCKER_BUILDKIT=1 docker build \
|
||||||
|
--build-arg max_jobs=16 \
|
||||||
|
--build-arg USE_SCCACHE=1 \
|
||||||
|
--build-arg GIT_REPO_CHECK=0 \
|
||||||
|
--tag vllm/vllm-tpu-bm \
|
||||||
|
--progress plain -f docker/Dockerfile.tpu .
|
||||||
|
|
||||||
|
LOG_ROOT=$(mktemp -d)
|
||||||
|
# If mktemp fails, set -e will cause the script to exit.
|
||||||
|
echo "Results will be stored in: $LOG_ROOT"
|
||||||
|
|
||||||
|
if [ -z "$HF_TOKEN" ]; then
|
||||||
|
echo "Error: HF_TOKEN is not set or is empty."
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
# Make sure mounted disk or dir exists
|
||||||
|
if [ ! -d "$DOWNLOAD_DIR" ]; then
|
||||||
|
echo "Error: Folder $DOWNLOAD_DIR does not exist. This is useually a mounted drive. If no mounted drive, just create a folder."
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
echo "Run model $MODEL"
|
||||||
|
echo
|
||||||
|
|
||||||
|
echo "starting docker...$CONTAINER_NAME"
|
||||||
|
echo
|
||||||
|
docker run \
|
||||||
|
-v $DOWNLOAD_DIR:$DOWNLOAD_DIR \
|
||||||
|
--env-file $ENV_FILE \
|
||||||
|
-e HF_TOKEN="$HF_TOKEN" \
|
||||||
|
-e TARGET_COMMIT=$BUILDKITE_COMMIT \
|
||||||
|
-e MODEL=$MODEL \
|
||||||
|
-e WORKSPACE=/workspace \
|
||||||
|
--name $CONTAINER_NAME \
|
||||||
|
-d \
|
||||||
|
--privileged \
|
||||||
|
--network host \
|
||||||
|
-v /dev/shm:/dev/shm \
|
||||||
|
vllm/vllm-tpu-bm tail -f /dev/null
|
||||||
|
|
||||||
|
echo "run script..."
|
||||||
|
echo
|
||||||
|
docker exec "$CONTAINER_NAME" /bin/bash -c ".buildkite/scripts/tpu/run_bm.sh"
|
||||||
|
|
||||||
|
echo "copy result back..."
|
||||||
|
VLLM_LOG="$LOG_ROOT/$TEST_NAME"_vllm_log.txt
|
||||||
|
BM_LOG="$LOG_ROOT/$TEST_NAME"_bm_log.txt
|
||||||
|
docker cp "$CONTAINER_NAME:/workspace/vllm_log.txt" "$VLLM_LOG"
|
||||||
|
docker cp "$CONTAINER_NAME:/workspace/bm_log.txt" "$BM_LOG"
|
||||||
|
|
||||||
|
throughput=$(grep "Request throughput (req/s):" "$BM_LOG" | sed 's/[^0-9.]//g')
|
||||||
|
echo "throughput for $TEST_NAME at $BUILDKITE_COMMIT: $throughput"
|
||||||
|
|
||||||
|
if [ "$BUILDKITE" = "true" ]; then
|
||||||
|
echo "Running inside Buildkite"
|
||||||
|
buildkite-agent artifact upload "$VLLM_LOG"
|
||||||
|
buildkite-agent artifact upload "$BM_LOG"
|
||||||
|
else
|
||||||
|
echo "Not running inside Buildkite"
|
||||||
|
fi
|
||||||
|
|
||||||
|
#
|
||||||
|
# compare the throughput with EXPECTED_THROUGHPUT
|
||||||
|
# and assert meeting the expectation
|
||||||
|
#
|
||||||
|
if [[ -z "$throughput" || ! "$throughput" =~ ^[0-9]+([.][0-9]+)?$ ]]; then
|
||||||
|
echo "Failed to get the throughput"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
if (( $(echo "$throughput < $EXPECTED_THROUGHPUT" | bc -l) )); then
|
||||||
|
echo "Error: throughput($throughput) is less than expected($EXPECTED_THROUGHPUT)"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
14
.buildkite/scripts/tpu/quantized_v6e_1.env
Normal file
14
.buildkite/scripts/tpu/quantized_v6e_1.env
Normal file
@ -0,0 +1,14 @@
|
|||||||
|
# Environment config
|
||||||
|
TEST_NAME=llama8bw8a8
|
||||||
|
CONTAINER_NAME=vllm-tpu
|
||||||
|
|
||||||
|
# vllm config
|
||||||
|
MODEL=RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8
|
||||||
|
MAX_NUM_SEQS=128
|
||||||
|
MAX_NUM_BATCHED_TOKENS=1024
|
||||||
|
TENSOR_PARALLEL_SIZE=1
|
||||||
|
MAX_MODEL_LEN=2048
|
||||||
|
DOWNLOAD_DIR=/mnt/disks/persist
|
||||||
|
EXPECTED_THROUGHPUT=10.0
|
||||||
|
INPUT_LEN=1800
|
||||||
|
OUTPUT_LEN=128
|
||||||
.buildkite/scripts/tpu/run_bm.sh (new executable file, 94 lines)
@@ -0,0 +1,94 @@
#!/bin/bash

set -euo pipefail

VLLM_LOG="$WORKSPACE/vllm_log.txt"
BM_LOG="$WORKSPACE/bm_log.txt"

if [ -n "$TARGET_COMMIT" ]; then
  head_hash=$(git rev-parse HEAD)
  if [ "$TARGET_COMMIT" != "$head_hash" ]; then
    echo "Error: target commit $TARGET_COMMIT does not match HEAD: $head_hash"
    exit 1
  fi
fi

echo "model: $MODEL"
echo

#
# create a log folder
#
mkdir "$WORKSPACE/log"

# TODO: Move to image building.
pip install pandas
pip install datasets

#
# create sonnet_4x
#
echo "Create sonnet_4x.txt"
echo "" > benchmarks/sonnet_4x.txt
for _ in {1..4}
do
  cat benchmarks/sonnet.txt >> benchmarks/sonnet_4x.txt
done

#
# start vllm service in backend
#
echo "launching vllm..."
echo "logging to $VLLM_LOG"
echo

VLLM_USE_V1=1 vllm serve $MODEL \
  --seed 42 \
  --disable-log-requests \
  --max-num-seqs $MAX_NUM_SEQS \
  --max-num-batched-tokens $MAX_NUM_BATCHED_TOKENS \
  --tensor-parallel-size $TENSOR_PARALLEL_SIZE \
  --no-enable-prefix-caching \
  --download_dir $DOWNLOAD_DIR \
  --max-model-len $MAX_MODEL_LEN > "$VLLM_LOG" 2>&1 &

echo "wait for up to 20 minutes..."
echo
# Poll the log for up to 20 minutes (120 checks, 10 seconds apart) instead of a blind "sleep 1200".
for i in {1..120}; do
  # TODO: detect other types of errors.
  if grep -Fq "raise RuntimeError" "$VLLM_LOG"; then
    echo "Detected RuntimeError, exiting."
    exit 1
  elif grep -Fq "Application startup complete" "$VLLM_LOG"; then
    echo "Application started"
    break
  else
    echo "wait for 10 seconds..."
    sleep 10
  fi
done

#
# run test
#
echo "run benchmark test..."
echo "logging to $BM_LOG"
echo
python benchmarks/benchmark_serving.py \
  --backend vllm \
  --model $MODEL \
  --dataset-name sonnet \
  --dataset-path benchmarks/sonnet_4x.txt \
  --sonnet-input-len $INPUT_LEN \
  --sonnet-output-len $OUTPUT_LEN \
  --ignore-eos > "$BM_LOG"

echo "completed..."
echo

throughput=$(grep "Request throughput (req/s):" "$BM_LOG" | sed 's/[^0-9.]//g')
echo "throughput: $throughput"
echo
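The readiness loop above greps the server log for the startup message. An alternative sketch, not part of the script above, is to poll the server's HTTP health route instead; this assumes the OpenAI-compatible server is listening on the default port 8000 and exposes /health:

# Hypothetical alternative readiness check: poll /health instead of grepping the log.
for i in {1..120}; do
  if curl -sf http://localhost:8000/health > /dev/null; then
    echo "Application started"
    break
  fi
  echo "wait for 10 seconds..."
  sleep 10
done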
@@ -50,11 +50,11 @@ aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
 if [[ $normal_wheel == *"cu118"* ]]; then
     # if $normal_wheel matches cu118, do not upload the index.html
     echo "Skipping index files for cu118 wheels"
-elif [[ $normal_wheel == *"cu121"* ]]; then
-    # if $normal_wheel matches cu121, do not upload the index.html
-    echo "Skipping index files for cu121 wheels"
+elif [[ $normal_wheel == *"cu126"* ]]; then
+    # if $normal_wheel matches cu126, do not upload the index.html
+    echo "Skipping index files for cu126 wheels"
 else
-    # only upload index.html for cu124 wheels (default wheels)
+    # only upload index.html for cu128 wheels (default wheels)
     aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
     aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
 fi
@@ -66,12 +66,13 @@ aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/"
 if [[ $normal_wheel == *"cu118"* ]]; then
     # if $normal_wheel matches cu118, do not upload the index.html
     echo "Skipping index files for cu118 wheels"
-elif [[ $normal_wheel == *"cu121"* ]]; then
-    # if $normal_wheel matches cu121, do not upload the index.html
-    echo "Skipping index files for cu121 wheels"
+elif [[ $normal_wheel == *"cu126"* ]]; then
+    # if $normal_wheel matches cu126, do not upload the index.html
+    echo "Skipping index files for cu126 wheels"
 else
-    # only upload index.html for cu124 wheels (default wheels)
+    # only upload index.html for cu128 wheels (default wheels)
     aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html"
 fi

 aws s3 cp "$wheel" "s3://vllm-wheels/$version/"
+aws s3 cp index.html "s3://vllm-wheels/$version/vllm/index.html"
@@ -32,16 +32,27 @@ steps:
 ##### fast check tests #####
 
 - label: Documentation Build # 2min
-  working_dir: "/vllm-workspace/test_docs/docs"
+  mirror_hardwares: [amdexperimental]
+  working_dir: "/vllm-workspace/test_docs"
   fast_check: true
   no_gpu: True
   commands:
-  - pip install -r ../../requirements/docs.txt
-  - SPHINXOPTS=\"-W\" make html
-  # Check API reference (if it fails, you may have missing mock imports)
-  - grep \"sig sig-object py\" build/html/api/inference_params.html
+  - pip install -r ../requirements/docs.txt
+  # TODO: add `--strict` once warnings in docstrings are fixed
+  - mkdocs build
+
+- label: Pytorch Nightly Dependency Override Check # 2min
+  # if this test fails, it means the nightly torch version is not compatible with some
+  # of the dependencies. Please check the error message and add the package to whitelist
+  # in /vllm/tools/generate_nightly_torch_test.py
+  soft_fail: true
+  source_file_dependencies:
+  - requirements/nightly_torch_test.txt
+  commands:
+  - bash standalone_tests/pytorch_nightly_dependency.sh
 
 - label: Async Engine, Inputs, Utils, Worker Test # 24min
+  mirror_hardwares: [amdexperimental]
   source_file_dependencies:
   - vllm/
   - tests/mq_llm_engine
@@ -57,11 +68,13 @@ steps:
   - pytest -v -s async_engine # AsyncLLMEngine
   - NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py
   - pytest -v -s test_inputs.py
+  - pytest -v -s test_outputs.py
   - pytest -v -s multimodal
   - pytest -v -s test_utils.py # Utils
   - pytest -v -s worker # Worker
 
 - label: Python-only Installation Test
+  mirror_hardwares: [amdexperimental]
   source_file_dependencies:
   - tests/standalone_tests/python_only_compile.sh
   - setup.py
@@ -69,7 +82,7 @@ steps:
   - bash standalone_tests/python_only_compile.sh
 
 - label: Basic Correctness Test # 30min
-  #mirror_hardwares: [amd]
+  mirror_hardwares: [amdexperimental, amdproduction]
   fast_check: true
   torch_nightly: true
   source_file_dependencies:
@@ -86,6 +99,7 @@ steps:
   - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
 
 - label: Chunked Prefill Test
+  mirror_hardwares: [amdexperimental, amdproduction]
   source_file_dependencies:
   - vllm/
   - tests/basic_correctness/test_chunked_prefill
@@ -94,7 +108,7 @@ steps:
   - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
 
 - label: Core Test # 10min
-  mirror_hardwares: [amd]
+  mirror_hardwares: [amdexperimental, amdproduction]
   fast_check: true
   source_file_dependencies:
   - vllm/core
@@ -104,10 +118,10 @@ steps:
   - pytest -v -s core
 
 - label: Entrypoints Test # 40min
+  mirror_hardwares: [amdexperimental]
   working_dir: "/vllm-workspace/tests"
   fast_check: true
   torch_nightly: true
-  #mirror_hardwares: [amd]
   source_file_dependencies:
   - vllm/
   - tests/entrypoints/llm
@@ -121,11 +135,12 @@ steps:
   - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
   - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
   - VLLM_USE_V1=0 pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
-  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/correctness/ --ignore=entrypoints/openai/test_openai_schema.py
+  - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_tensorizer_entrypoint.py --ignore=entrypoints/openai/correctness/
   - pytest -v -s entrypoints/test_chat_utils.py
   - VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
 
 - label: Distributed Tests (4 GPUs) # 10min
+  mirror_hardwares: [amdexperimental]
   working_dir: "/vllm-workspace/tests"
   num_gpus: 4
   source_file_dependencies:
@@ -133,32 +148,57 @@ steps:
   - vllm/core/
   - tests/distributed/test_utils
   - tests/distributed/test_pynccl
+  - tests/distributed/test_events
   - tests/spec_decode/e2e/test_integration_dist_tp4
   - tests/compile/test_basic_correctness
   - examples/offline_inference/rlhf.py
   - examples/offline_inference/rlhf_colocate.py
   - tests/examples/offline_inference/data_parallel.py
   - tests/v1/test_async_llm_dp.py
+  - tests/v1/test_external_lb_dp.py
+  - tests/v1/engine/test_engine_core_client.py
   commands:
   # test with tp=2 and external_dp=2
   - VLLM_USE_V1=0 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
   - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
+  # test with tp=2 and pp=2
+  - PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
   # test with internal dp
-  - python3 ../examples/offline_inference/data_parallel.py
+  - python3 ../examples/offline_inference/data_parallel.py --enforce-eager
   - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
+  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_external_lb_dp.py
+  - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
   - pytest -v -s distributed/test_utils.py
   - pytest -v -s compile/test_basic_correctness.py
   - pytest -v -s distributed/test_pynccl.py
+  - pytest -v -s distributed/test_events.py
   - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py
   # TODO: create a dedicated test section for multi-GPU example tests
   # when we have multiple distributed example tests
   - pushd ../examples/offline_inference
-  - python3 rlhf.py
-  - RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
+  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 python3 rlhf.py
+  - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
   - popd
 
+- label: EPLB Algorithm Test
+  working_dir: "/vllm-workspace/tests"
+  source_file_dependencies:
+  - vllm/distributed/eplb
+  - tests/distributed/test_eplb_algo.py
+  commands:
+  - pytest -v -s distributed/test_eplb_algo.py
+
+- label: EPLB Execution Test # 5min
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 4
+  source_file_dependencies:
+  - vllm/distributed/eplb
+  - tests/distributed/test_eplb_execute.py
+  commands:
+  - pytest -v -s distributed/test_eplb_execute.py
+
 - label: Metrics, Tracing Test # 10min
-  mirror_hardwares: [amd]
+  mirror_hardwares: [amdexperimental, amdproduction]
   num_gpus: 2
   source_file_dependencies:
   - vllm/
@@ -166,13 +206,18 @@ steps:
   - tests/tracing
   commands:
   - pytest -v -s metrics
+  - "pip install \
+      'opentelemetry-sdk>=1.26.0' \
+      'opentelemetry-api>=1.26.0' \
+      'opentelemetry-exporter-otlp>=1.26.0' \
+      'opentelemetry-semantic-conventions-ai>=0.4.1'"
   - pytest -v -s tracing
 
 ##### fast check tests #####
 ##### 1 GPU test #####
 
 - label: Regression Test # 5min
-  #mirror_hardwares: [amd]
+  mirror_hardwares: [amdexperimental]
   source_file_dependencies:
   - vllm/
   - tests/test_regression
@@ -182,7 +227,7 @@ steps:
   working_dir: "/vllm-workspace/tests" # optional
 
 - label: Engine Test # 10min
-  mirror_hardwares: [amd]
+  mirror_hardwares: [amdexperimental]
   source_file_dependencies:
   - vllm/
   - tests/engine
@@ -190,13 +235,14 @@ steps:
   - tests/test_sequence
   - tests/test_config
   - tests/test_logger
+  - tests/test_vllm_port
   commands:
-  - pytest -v -s engine test_sequence.py test_config.py test_logger.py
+  - pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
   # OOM in the CI unless we run this separately
   - pytest -v -s tokenization
 
 - label: V1 Test
-  #mirror_hardwares: [amd]
+  mirror_hardwares: [amdexperimental]
   source_file_dependencies:
   - vllm/
   - tests/v1
@@ -209,10 +255,11 @@ steps:
   - pytest -v -s v1/worker
   - pytest -v -s v1/structured_output
   - pytest -v -s v1/spec_decode
+  - pytest -v -s v1/kv_connector/unit
   - pytest -v -s v1/test_serial_utils.py
-  - pytest -v -s v1/test_stats.py
   - pytest -v -s v1/test_utils.py
   - pytest -v -s v1/test_oracle.py
+  - pytest -v -s v1/test_metrics_reader.py
   # TODO: accuracy does not match, whether setting
   # VLLM_USE_FLASHINFER_SAMPLER or not on H100.
   - pytest -v -s v1/e2e
@@ -221,8 +268,8 @@ steps:
   - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
 
 - label: Examples Test # 25min
+  mirror_hardwares: [amdexperimental]
   working_dir: "/vllm-workspace/examples"
-  #mirror_hardwares: [amd]
   source_file_dependencies:
   - vllm/entrypoints
   - examples/
@@ -237,7 +284,7 @@ steps:
   - python3 offline_inference/vision_language.py --seed 0
   - python3 offline_inference/vision_language_embedding.py --seed 0
   - python3 offline_inference/vision_language_multi_image.py --seed 0
-  - VLLM_USE_V1=0 python3 other/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 other/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
+  - VLLM_USE_V1=0 python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
   - python3 offline_inference/encoder_decoder.py
   - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
   - python3 offline_inference/basic/classify.py
@@ -246,14 +293,24 @@ steps:
   - VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2
 
 - label: Prefix Caching Test # 9min
-  mirror_hardwares: [amd]
+  mirror_hardwares: [amdexperimental, amdproduction]
   source_file_dependencies:
   - vllm/
   - tests/prefix_caching
   commands:
   - pytest -v -s prefix_caching
 
+
+- label: Platform Tests (CUDA)
+  mirror_hardwares: [amdexperimental]
+  source_file_dependencies:
+  - vllm/
+  - tests/cuda
+  commands:
+  - pytest -v -s cuda/test_cuda_context.py
+
 - label: Samplers Test # 36min
+  mirror_hardwares: [amdexperimental]
   source_file_dependencies:
   - vllm/model_executor/layers
   - vllm/sampling_metadata.py
@@ -263,18 +320,8 @@ steps:
   - pytest -v -s samplers
   - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
 
-- label: LogitsProcessor Test # 5min
-  mirror_hardwares: [amd]
-  source_file_dependencies:
-  - vllm/model_executor/layers
-  - vllm/model_executor/guided_decoding
-  - tests/test_logits_processor
-  - tests/model_executor/test_guided_processors
-  commands:
-  - pytest -v -s test_logits_processor.py
-  - pytest -v -s model_executor/test_guided_processors.py
-
 - label: Speculative decoding tests # 40min
+  mirror_hardwares: [amdexperimental]
   source_file_dependencies:
   - vllm/spec_decode
   - tests/spec_decode
@@ -285,7 +332,7 @@ steps:
   - pytest -v -s spec_decode/e2e/test_eagle_correctness.py
 
 - label: LoRA Test %N # 15min each
-  #mirror_hardwares: [amd]
+  mirror_hardwares: [amdexperimental, amdproduction]
   source_file_dependencies:
   - vllm/lora
   - tests/lora
@@ -293,14 +340,22 @@ steps:
   parallelism: 4
 
 - label: PyTorch Compilation Unit Tests
+  mirror_hardwares: [amdexperimental]
+  torch_nightly: true
   source_file_dependencies:
   - vllm/
   - tests/compile
   commands:
   - pytest -v -s compile/test_pass_manager.py
   - pytest -v -s compile/test_fusion.py
+  - pytest -v -s compile/test_fusion_attn.py
+  - pytest -v -s compile/test_silu_mul_quant_fusion.py
+  - pytest -v -s compile/test_sequence_parallelism.py
+  - pytest -v -s compile/test_async_tp.py
 
 - label: PyTorch Fullgraph Smoke Test # 9min
+  mirror_hardwares: [amdexperimental, amdproduction]
+  torch_nightly: true
   source_file_dependencies:
   - vllm/
   - tests/compile
@@ -309,8 +364,11 @@ steps:
   # these tests need to be separated, cannot combine
   - pytest -v -s compile/piecewise/test_simple.py
   - pytest -v -s compile/piecewise/test_toy_llama.py
+  - pytest -v -s compile/piecewise/test_full_cudagraph.py
 
 - label: PyTorch Fullgraph Test # 18min
+  mirror_hardwares: [amdexperimental, amdproduction]
+  torch_nightly: true
   source_file_dependencies:
   - vllm/
   - tests/compile
@@ -318,6 +376,7 @@ steps:
   - pytest -v -s compile/test_full_graph.py
 
 - label: Kernels Core Operation Test
+  mirror_hardwares: [amdexperimental, amdproduction]
   source_file_dependencies:
   - csrc/
   - tests/kernels/core
@@ -325,6 +384,7 @@ steps:
   - pytest -v -s kernels/core
 
 - label: Kernels Attention Test %N
+  mirror_hardwares: [amdexperimental, amdproduction]
   source_file_dependencies:
   - csrc/attention/
   - vllm/attention
@@ -335,6 +395,7 @@ steps:
   parallelism: 2
 
 - label: Kernels Quantization Test %N
+  mirror_hardwares: [amdexperimental, amdproduction]
   source_file_dependencies:
   - csrc/quantization/
   - vllm/model_executor/layers/quantization
@@ -344,6 +405,7 @@ steps:
   parallelism: 2
 
 - label: Kernels MoE Test
+  mirror_hardwares: [amdexperimental]
   source_file_dependencies:
   - csrc/moe/
   - tests/kernels/moe
@@ -352,6 +414,7 @@ steps:
   - pytest -v -s kernels/moe
 
 - label: Kernels Mamba Test
+  mirror_hardwares: [amdexperimental]
   source_file_dependencies:
   - csrc/mamba/
   - tests/kernels/mamba
@@ -359,48 +422,69 @@ steps:
   - pytest -v -s kernels/mamba
 
 - label: Tensorizer Test # 11min
-  # mirror_hardwares: [amd]
+  mirror_hardwares: [amdexperimental]
   soft_fail: true
   source_file_dependencies:
   - vllm/model_executor/model_loader
   - tests/tensorizer_loader
+  - tests/entrypoints/openai/test_tensorizer_entrypoint.py
   commands:
   - apt-get update && apt-get install -y curl libsodium23
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
   - pytest -v -s tensorizer_loader
+  - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py
+
+- label: Model Executor Test
+  mirror_hardwares: [amdexperimental, amdproduction]
+  soft_fail: true
+  source_file_dependencies:
+  - vllm/model_executor
+  - tests/model_executor
+  commands:
+  - apt-get update && apt-get install -y curl libsodium23
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - pytest -v -s model_executor
 
 - label: Benchmarks # 9min
+  mirror_hardwares: [amdexperimental, amdproduction]
   working_dir: "/vllm-workspace/.buildkite"
-  mirror_hardwares: [amd]
   source_file_dependencies:
   - benchmarks/
   commands:
   - bash scripts/run-benchmarks.sh
 
 - label: Benchmarks CLI Test # 10min
+  mirror_hardwares: [amdexperimental, amdproduction]
   source_file_dependencies:
   - vllm/
   - tests/benchmarks/
   commands:
   - pytest -v -s benchmarks/
 
-- label: Quantization Test # 33min
+- label: Quantization Test
+  mirror_hardwares: [amdexperimental]
   source_file_dependencies:
   - csrc/
   - vllm/model_executor/layers/quantization
   - tests/quantization
-  command: VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
+  commands:
+  # temporary install here since we need nightly, will move to requirements/test.in
+  # after torchao 0.12 release
+  - pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126
+  - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
 
 - label: LM Eval Small Models # 53min
+  mirror_hardwares: [amdexperimental]
   working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
   source_file_dependencies:
   - csrc/
   - vllm/model_executor/layers/quantization
   commands:
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-  - bash ./run-tests.sh -c configs/models-small.txt -t 1
+  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-small.txt --tp-size=1
 
 - label: OpenAI API correctness
+  mirror_hardwares: [amdexperimental]
   source_file_dependencies:
   - csrc/
   - vllm/entrypoints/openai/
@@ -409,6 +493,7 @@ steps:
   - pytest -s entrypoints/openai/correctness/
 
 - label: Encoder Decoder tests # 5min
+  mirror_hardwares: [amdexperimental]
   source_file_dependencies:
   - vllm/
   - tests/encoder_decoder
@@ -416,8 +501,8 @@ steps:
   - pytest -v -s encoder_decoder
 
 - label: OpenAI-Compatible Tool Use # 20 min
+  mirror_hardwares: [amdexperimental]
   fast_check: false
-  #mirror_hardwares: [ amd ]
   source_file_dependencies:
   - vllm/
   - tests/tool_use
@@ -429,92 +514,115 @@ steps:
 ##### models test #####
 
 - label: Basic Models Test # 24min
+  mirror_hardwares: [amdexperimental]
+  torch_nightly: true
   source_file_dependencies:
   - vllm/
   - tests/models
   commands:
   - pytest -v -s models/test_transformers.py
   - pytest -v -s models/test_registry.py
-  # V1 Test: https://github.com/vllm-project/vllm/issues/14531
-  - VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'
-  - VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'llama4'
-  - VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'plamo2'
+  - pytest -v -s models/test_utils.py
+  - pytest -v -s models/test_vision.py
+  - pytest -v -s models/test_initialization.py
 
-- label: Language Models Test (Standard) # 32min
-  #mirror_hardwares: [amd]
+- label: Language Models Test (Standard)
+  mirror_hardwares: [amdexperimental]
+  torch_nightly: true
   source_file_dependencies:
   - vllm/
-  - tests/models/decoder_only/language
-  - tests/models/embedding/language
-  - tests/models/encoder_decoder/language
+  - tests/models/language
   commands:
   # Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
-  - pip install causal-conv1d
-  - pytest -v -s models/decoder_only/language -m 'core_model or quant_model'
-  - pytest -v -s models/embedding/language -m core_model
+  - pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
+  - pip freeze | grep -E 'torch'
+  - pytest -v -s models/language -m core_model
 
-- label: Language Models Test (Extended) # 1h10min
-  optional: true
+- label: Language Models Test (Hybrid) # 35 min
+  mirror_hardwares: [amdexperimental]
+  torch_nightly: true
   source_file_dependencies:
   - vllm/
-  - tests/models/decoder_only/language
-  - tests/models/embedding/language
-  - tests/models/encoder_decoder/language
+  - tests/models/language/generation
   commands:
   # Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
-  - pip install causal-conv1d
-  - pytest -v -s models/decoder_only/language -m 'not core_model and not quant_model'
-  - pytest -v -s models/embedding/language -m 'not core_model'
+  - pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
+  - pytest -v -s models/language/generation -m hybrid_model
 
-- label: Multi-Modal Models Test (Standard) # 40min
-  #mirror_hardwares: [amd]
-  source_file_dependencies:
-  - vllm/
-  - tests/models/decoder_only/audio_language
-  - tests/models/decoder_only/vision_language
-  - tests/models/embedding/vision_language
-  - tests/models/encoder_decoder/audio_language
-  - tests/models/encoder_decoder/vision_language
-  commands:
-  - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
-  - pytest -v -s models/multimodal
-  - pytest -v -s models/decoder_only/audio_language -m 'core_model or quant_model'
-  - pytest -v -s models/decoder_only/vision_language -m 'core_model or quant_model'
-  - pytest -v -s models/embedding/vision_language -m core_model
-  - pytest -v -s models/encoder_decoder/audio_language -m core_model
-  - pytest -v -s models/encoder_decoder/language -m core_model
-  - pytest -v -s models/encoder_decoder/vision_language -m core_model
-  - pytest -v -s models/decoder_only/vision_language/test_interleaved.py
-
-- label: Multi-Modal Models Test (Extended) 1 # 48m
+- label: Language Models Test (Extended Generation) # 1hr20min
+  mirror_hardwares: [amdexperimental]
   optional: true
   source_file_dependencies:
   - vllm/
-  - tests/models/decoder_only/audio_language
-  - tests/models/decoder_only/vision_language
-  - tests/models/embedding/vision_language
-  - tests/models/encoder_decoder/vision_language
+  - tests/models/language/generation
   commands:
-  - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
-  - pytest -v -s models/decoder_only/audio_language -m 'not core_model and not quant_model'
-  - pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=0) and not core_model and not quant_model'
-  - pytest -v -s --ignore models/decoder_only/vision_language/test_models.py models/decoder_only/vision_language -m 'not core_model and not quant_model'
-  - pytest -v -s models/embedding/vision_language -m 'not core_model'
-  - pytest -v -s models/encoder_decoder/language -m 'not core_model'
-  - pytest -v -s models/encoder_decoder/vision_language -m 'not core_model'
+  # Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
+  - pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
+  - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'
 
-- label: Multi-Modal Models Test (Extended) 2 # 38m
+- label: Language Models Test (Extended Pooling) # 36min
+  mirror_hardwares: [amdexperimental]
   optional: true
   source_file_dependencies:
   - vllm/
-  - tests/models/decoder_only/vision_language
+  - tests/models/language/pooling
+  commands:
+  - pytest -v -s models/language/pooling -m 'not core_model'
+
+- label: Multi-Modal Models Test (Standard)
+  mirror_hardwares: [amdexperimental]
+  torch_nightly: true
+  source_file_dependencies:
+  - vllm/
+  - tests/models/multimodal
   commands:
   - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
-  - pytest -v -s models/decoder_only/vision_language/test_models.py -m 'split(group=1) and not core_model and not quant_model'
+  - pip freeze | grep -E 'torch'
+  - pytest -v -s models/multimodal/processing
+  - pytest -v -s --ignore models/multimodal/generation/test_whisper.py models/multimodal -m core_model
+  - cd .. && pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
+
+- label: Multi-Modal Models Test (Extended) 1
+  mirror_hardwares: [amdexperimental]
+  optional: true
+  source_file_dependencies:
+  - vllm/
+  - tests/models/multimodal
+  commands:
+  - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+  - pytest -v -s --ignore models/multimodal/generation/test_common.py --ignore models/multimodal/processing models/multimodal -m 'not core_model'
+
+- label: Multi-Modal Models Test (Extended) 2
+  mirror_hardwares: [amdexperimental]
+  optional: true
+  source_file_dependencies:
+  - vllm/
+  - tests/models/multimodal
+  commands:
+  - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+  - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=0) and not core_model'
+
+- label: Multi-Modal Models Test (Extended) 3
+  mirror_hardwares: [amdexperimental]
+  optional: true
+  source_file_dependencies:
+  - vllm/
+  - tests/models/multimodal
+  commands:
+  - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+  - pytest -v -s models/multimodal/generation/test_common.py -m 'split(group=1) and not core_model'
+
+- label: Quantized Models Test
+  mirror_hardwares: [amdexperimental, amdproduction]
+  source_file_dependencies:
+  - vllm/model_executor/layers/quantization
+  - tests/models/quantization
+  commands:
+  - pytest -v -s models/quantization
 
 # This test is used only in PR development phase to test individual models and should never run on main
 - label: Custom Models Test
-  mirror_hardwares: [amd]
+  mirror_hardwares: [amdexperimental, amdproduction]
   optional: true
   commands:
   - echo 'Testing custom models...'
@@ -526,7 +634,7 @@ steps:
 ##### multi gpus test #####
 
 - label: Distributed Comm Ops Test # 7min
-  mirror_hardwares: [amd]
+  mirror_hardwares: [amdexperimental, amdproduction]
   working_dir: "/vllm-workspace/tests"
   num_gpus: 2
   source_file_dependencies:
@@ -537,6 +645,7 @@ steps:
   - pytest -v -s distributed/test_shm_broadcast.py
 
 - label: 2 Node Tests (4 GPUs in total) # 16min
+  mirror_hardwares: [amdexperimental]
   working_dir: "/vllm-workspace/tests"
   num_gpus: 2
   num_nodes: 2
@@ -546,16 +655,21 @@ steps:
   - vllm/executor/
   - vllm/model_executor/models/
   - tests/distributed/
+  - tests/examples/offline_inference/data_parallel.py
   commands:
   - # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
   - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
+  - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
+  - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
   - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py
   - VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
   - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
   - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
+  - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
+  - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
 
 - label: Distributed Tests (2 GPUs) # 40min
-  #mirror_hardwares: [amd]
+  mirror_hardwares: [amdexperimental]
   working_dir: "/vllm-workspace/tests"
   num_gpus: 2
   source_file_dependencies:
@@ -570,9 +684,13 @@ steps:
   - vllm/worker/model_runner.py
   - entrypoints/llm/test_collective_rpc.py
   - tests/v1/test_async_llm_dp.py
+  - tests/v1/test_external_lb_dp.py
+  - tests/v1/entrypoints/openai/test_multi_api_servers.py
   - vllm/v1/engine/
   commands:
   - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
+  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_external_lb_dp.py
+  - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
   - pytest -v -s entrypoints/llm/test_collective_rpc.py
   - pytest -v -s ./compile/test_basic_correctness.py
   - pytest -v -s ./compile/test_wrapper.py
@@ -580,9 +698,10 @@ steps:
   - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
   # Avoid importing model tests that cause CUDA reinitialization error
   - pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)'
-  - pytest models/encoder_decoder/language/test_bart.py -v -s -m 'distributed(num_gpus=2)'
-  - pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m 'distributed(num_gpus=2)'
-  - pytest models/decoder_only/vision_language/test_models.py -v -s -m 'distributed(num_gpus=2)'
+  - pytest models/language -v -s -m 'distributed(num_gpus=2)'
+  - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)'
+  # test sequence parallel
+  - pytest -v -s distributed/test_sequence_parallel.py
   # this test fails consistently.
   # TODO: investigate and fix
   # - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
@@ -591,13 +710,14 @@ steps:
   - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
 
 - label: Plugin Tests (2 GPUs) # 40min
+  mirror_hardwares: [amdexperimental]
   working_dir: "/vllm-workspace/tests"
   num_gpus: 2
   source_file_dependencies:
   - vllm/plugins/
   - tests/plugins/
   commands:
-  # begin platform plugin tests, all the code in-between runs on dummy platform
+  # begin platform plugin and general plugin tests, all the code in-between runs on dummy platform
   - pip install -e ./plugins/vllm_add_dummy_platform
   - pytest -v -s plugins_tests/test_platform_plugins.py
   - pip uninstall vllm_add_dummy_platform -y
@@ -608,8 +728,10 @@ steps:
   - pytest -v -s distributed/test_distributed_oot.py
   - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
   - pytest -v -s models/test_oot_registration.py # it needs a clean process
+  - pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins
 
 - label: Multi-step Tests (4 GPUs) # 36min
+  mirror_hardwares: [amdexperimental, amdproduction]
   working_dir: "/vllm-workspace/tests"
   num_gpus: 4
   source_file_dependencies:
@@ -630,6 +752,7 @@ steps:
   - pytest -v -s multi_step/test_correctness_llm.py
 
 - label: Pipeline Parallelism Test # 45min
+  mirror_hardwares: [amdexperimental, amdproduction]
  working_dir: "/vllm-workspace/tests"
   num_gpus: 4
   source_file_dependencies:
@@ -643,6 +766,7 @@ steps:
   - pytest -v -s distributed/test_pipeline_parallel.py
 
 - label: LoRA TP Test (Distributed)
+  mirror_hardwares: [amdexperimental, amdproduction]
   num_gpus: 4
   source_file_dependencies:
   - vllm/lora
@@ -658,6 +782,7 @@ steps:
 
 
 - label: Weight Loading Multiple GPU Test # 33min
+  mirror_hardwares: [amdexperimental]
   working_dir: "/vllm-workspace/tests"
   num_gpus: 2
   source_file_dependencies:
@@ -667,6 +792,7 @@ steps:
   - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt
 
 - label: Weight Loading Multiple GPU Test - Large Models # optional
+  mirror_hardwares: [amdexperimental]
   working_dir: "/vllm-workspace/tests"
   num_gpus: 2
   gpu: a100
@@ -705,4 +831,4 @@ steps:
   - vllm/model_executor/layers/quantization
   commands:
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-  - bash ./run-tests.sh -c configs/models-large.txt -t 4
+  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
.github/CODEOWNERS (vendored, 27 changed lines)
@@ -10,13 +10,21 @@
 /vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
 /vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
 /vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth
-/vllm/model_executor/guided_decoding @mgoin @russellb
+/vllm/model_executor/guided_decoding @mgoin @russellb @aarnphm
 /vllm/multimodal @DarkLight1337 @ywang96
-CMakeLists.txt @tlrmchlsmth
+/vllm/vllm_flash_attn @LucasWilkinson
+/vllm/lora @jeejeelee
+/vllm/reasoning @aarnphm
+/vllm/entrypoints @aarnphm
+CMakeLists.txt @tlrmchlsmth @LucasWilkinson
+
+# Any change to the VllmConfig changes can have a large user-facing impact,
+# so spam a lot of people
+/vllm/config.py @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor
 
 # vLLM V1
 /vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
-/vllm/v1/structured_output @mgoin @russellb
+/vllm/v1/structured_output @mgoin @russellb @aarnphm
 
 # Test ownership
 /.buildkite/lm-eval-harness @mgoin @simon-mo
@@ -25,8 +33,8 @@ CMakeLists.txt @tlrmchlsmth
 /tests/distributed/test_multi_node_assignment.py @youkaichao
 /tests/distributed/test_pipeline_parallel.py @youkaichao
 /tests/distributed/test_same_node.py @youkaichao
-/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo
-/tests/entrypoints/llm/test_guided_generate.py @mgoin @russellb
+/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo @aarnphm
+/tests/entrypoints/llm/test_guided_generate.py @mgoin @russellb @aarnphm
 /tests/kernels @tlrmchlsmth @WoosukKwon
 /tests/model_executor/test_guided_processors.py @mgoin @russellb
 /tests/models @DarkLight1337 @ywang96
@@ -36,6 +44,11 @@ CMakeLists.txt @tlrmchlsmth
 /tests/quantization @mgoin @robertgshaw2-redhat
 /tests/spec_decode @njhill @LiuXiaoxuanPKU
 /tests/test_inputs.py @DarkLight1337 @ywang96
-/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb
-/tests/v1/structured_output @mgoin @russellb
+/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
+/tests/v1/structured_output @mgoin @russellb @aarnphm
 /tests/weight_loading @mgoin @youkaichao
+/tests/lora @jeejeelee
+
+# Docs
+/docs @hmellor
+mkdocs.yaml @hmellor
.github/ISSUE_TEMPLATE/400-bug-report.yml (vendored, 22 changed lines)
@@ -8,6 +8,16 @@ body:
   attributes:
     value: >
       #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
+- type: markdown
+  attributes:
+    value: |
+      ⚠️ **SECURITY WARNING:** Please review any text you paste to ensure it does not contain sensitive information such as:
+      - API tokens or keys (e.g., Hugging Face tokens, OpenAI API keys)
+      - Passwords or authentication credentials
+      - Private URLs or endpoints
+      - Personal or confidential data
+
+      Consider redacting or replacing sensitive values with placeholders like `<YOUR_TOKEN_HERE>` when sharing configuration or code examples.
 - type: textarea
   attributes:
     label: Your current environment
@@ -21,12 +31,12 @@ body:
       It is suggested to download and execute the latest script, as vllm might frequently update the diagnosis information needed for accurately and quickly responding to issues.
     value: |
       <details>
-      <summary>The output of `python collect_env.py`</summary>
+      <summary>The output of <code>python collect_env.py</code></summary>
 
       ```text
       Your output of `python collect_env.py` here
       ```
 
       </details>
   validations:
     required: true
@@ -75,20 +85,20 @@ body:
       ```
 
      ```
-      The error message you got, with the full traceback.
+      The error message you got, with the full traceback and the error logs with [dump_input.py:##] if present.
      ```
   validations:
     required: true
 - type: markdown
   attributes:
-    value: >
-      ⚠️ Please separate bugs of `transformers` implementation or usage from bugs of `vllm`. If you think anything is wrong with the models' output:
+    value: |
+      ⚠️ Please separate bugs of `transformers` implementation or usage from bugs of `vllm`. If you think anything is wrong with the model's output:
 
       - Try the counterpart of `transformers` first. If the error appears, please go to [their issues](https://github.com/huggingface/transformers/issues?q=is%3Aissue+is%3Aopen+sort%3Aupdated-desc).
 
       - If the error only appears in vllm, please provide the detailed script of how you run `transformers` and `vllm`, also highlight the difference and what you expect.
 
-      Thanks for contributing 🎉!
+      Thanks for reporting 🙏!
 - type: checkboxes
   id: askllm
   attributes:
.github/ISSUE_TEMPLATE/450-ci-failure.yml (new file, 69 lines, vendored)

````diff
@@ -0,0 +1,69 @@
+name: 🧪 CI failure report
+description: Report a failing test.
+title: "[CI Failure]: "
+labels: ["ci-failure"]
+
+body:
+- type: markdown
+attributes:
+value: >
+#### Include the name of the failing Buildkite step and test file in the title.
+- type: input
+attributes:
+label: Name of failing test
+description: |
+Paste in the fully-qualified name of the failing test from the logs.
+placeholder: |
+`path/to/test_file.py::test_name[params]`
+validations:
+required: true
+- type: checkboxes
+attributes:
+label: Basic information
+description: Select all items that apply to the failing test.
+options:
+- label: Flaky test
+- label: Can reproduce locally
+- label: Caused by external libraries (e.g. bug in `transformers`)
+- type: textarea
+attributes:
+label: 🧪 Describe the failing test
+description: |
+Please provide a clear and concise description of the failing test.
+placeholder: |
+A clear and concise description of the failing test.
+
+```
+The error message you got, with the full traceback and the error logs with [dump_input.py:##] if present.
+```
+validations:
+required: true
+- type: textarea
+attributes:
+label: 📝 History of failing test
+description: |
+Since when did the test start to fail?
+You can look up its history via [Buildkite Test Suites](https://buildkite.com/organizations/vllm/analytics/suites/ci-1/tests?branch=main).
+
+If you have time, identify the PR that caused the test to fail on main. You can do so via the following methods:
+
+- Use Buildkite Test Suites to find the PR where the test failure first occurred, and reproduce the failure locally.
+
+- Run [`git bisect`](https://git-scm.com/docs/git-bisect) locally.
+
+- Manually unblock Buildkite steps for suspected PRs on main and check the results. (authorized users only)
+placeholder: |
+Approximate timeline and/or problematic PRs
+
+A link to the Buildkite analytics of the failing test (if available)
+validations:
+required: true
+- type: textarea
+attributes:
+label: CC List.
+description: >
+The list of people you want to CC. Usually, this includes those who worked on the PR that failed the test.
+- type: markdown
+attributes:
+value: >
+Thanks for reporting 🙏!
````
.github/PULL_REQUEST_TEMPLATE.md (18 changes, vendored)

```diff
@@ -1,6 +1,18 @@
-FILL IN THE PR DESCRIPTION HERE
+## Essential Elements of an Effective PR Description Checklist
+- [ ] The purpose of the PR, such as "Fix some issue (link existing issues this PR will resolve)".
+- [ ] The test plan, such as providing test command.
+- [ ] The test results, such as pasting the results comparison before and after, or e2e results
+- [ ] (Optional) The necessary documentation update, such as updating `supported_models.md` and `examples` for a new model.

-FIX #xxxx (*link existing issues this PR will resolve*)
+PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS ABOVE HAVE BEEN CONSIDERED.

+## Purpose
+
+## Test Plan
+
+## Test Result
+
+## (Optional) Documentation Update
+
 <!--- pyml disable-next-line no-emphasis-as-heading -->
-**BEFORE SUBMITTING, PLEASE READ <https://docs.vllm.ai/en/latest/contributing/overview.html>** (anything written below this line will be removed by GitHub Actions)
+**BEFORE SUBMITTING, PLEASE READ <https://docs.vllm.ai/en/latest/contributing>** (anything written below this line will be removed by GitHub Actions)
```
.github/mergify.yml (132 changes, vendored)

```diff
@@ -27,6 +27,22 @@ pull_request_rules:
 add:
 - ci/build

+- name: label-deepseek
+description: Automatically apply deepseek label
+conditions:
+- or:
+- files~=^examples/.*deepseek.*\.py
+- files~=^tests/.*deepseek.*\.py
+- files~=^vllm/entrypoints/openai/tool_parsers/.*deepseek.*\.py
+- files~=^vllm/model_executor/models/.*deepseek.*\.py
+- files~=^vllm/reasoning/.*deepseek.*\.py
+- files~=^vllm/transformers_utils/.*deepseek.*\.py
+- title~=(?i)DeepSeek
+actions:
+label:
+add:
+- deepseek
+
 - name: label-frontend
 description: Automatically apply frontend label
 conditions:
@@ -36,6 +52,21 @@ pull_request_rules:
 add:
 - frontend

+- name: label-llama
+description: Automatically apply llama label
+conditions:
+- or:
+- files~=^examples/.*llama.*\.py
+- files~=^tests/.*llama.*\.py
+- files~=^vllm/entrypoints/openai/tool_parsers/llama.*\.py
+- files~=^vllm/model_executor/models/.*llama.*\.py
+- files~=^vllm/transformers_utils/configs/.*llama.*\.py
+- title~=(?i)llama
+actions:
+label:
+add:
+- llama
+
 - name: label-multi-modality
 description: Automatically apply multi-modality label
 conditions:
@@ -43,14 +74,72 @@ pull_request_rules:
 - files~=^vllm/multimodal/
 - files~=^tests/multimodal/
 - files~=^tests/models/multimodal/
-- files~=^tests/models/*/audio_language/
-- files~=^tests/models/*/vision_language/
 - files=tests/models/test_vision.py
 actions:
 label:
 add:
 - multi-modality

+- name: label-new-model
+description: Automatically apply new-model label
+conditions:
+- and:
+- files~=^vllm/model_executor/models/
+- files=vllm/model_executor/models/registry.py
+- files=tests/models/registry.py
+- files=docs/models/supported_models.md
+actions:
+label:
+add:
+- new-model
+
+- name: label-performance
+description: Automatically apply performance label
+conditions:
+- or:
+- files~=^benchmarks/
+- files~=^vllm/benchmarks/
+- files~=^tests/benchmarks/
+- files~=^\.buildkite/nightly-benchmarks/
+actions:
+label:
+add:
+- performance
+
+- name: label-qwen
+description: Automatically apply qwen label
+conditions:
+- or:
+- files~=^examples/.*qwen.*\.py
+- files~=^tests/.*qwen.*\.py
+- files~=^vllm/model_executor/models/.*qwen.*\.py
+- files~=^vllm/reasoning/.*qwen.*\.py
+- title~=(?i)Qwen
+actions:
+label:
+add:
+- qwen
+
+- name: label-rocm
+description: Automatically apply rocm label
+conditions:
+- or:
+- files~=^csrc/rocm/
+- files~=^docker/Dockerfile.rocm
+- files~=^requirements/rocm.*\.txt
+- files~=^vllm/attention/backends/rocm.*\.py
+- files~=^vllm/attention/ops/rocm.*\.py
+- files~=^vllm/model_executor/layers/fused_moe/rocm.*\.py
+- files~=^vllm/v1/attention/backends/mla/rocm.*\.py
+- files~=^tests/kernels/.*_rocm.*\.py
+- files=vllm/platforms/rocm.py
+- title~=(?i)AMD
+- title~=(?i)ROCm
+actions:
+label:
+add:
+- rocm
+
 - name: label-structured-output
 description: Automatically apply structured-output label
 conditions:
@@ -58,7 +147,7 @@ pull_request_rules:
 - files~=^benchmarks/structured_schemas/
 - files=benchmarks/benchmark_serving_structured_output.py
 - files=benchmarks/run_structured_output_benchmark.sh
-- files=docs/source/features/structured_outputs.md
+- files=docs/features/structured_outputs.md
 - files=examples/offline_inference/structured_outputs.py
 - files=examples/online_serving/openai_chat_completion_structured_outputs.py
 - files=examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
@@ -78,8 +167,14 @@ pull_request_rules:
 conditions:
 - or:
 - files~=^vllm/spec_decode/
+- files~=^vllm/v1/spec_decode/
 - files=vllm/model_executor/layers/spec_decode_base_sampler.py
 - files~=^tests/spec_decode/
+- files~=^tests/v1/spec_decode/
+- files~=^examples/.*(spec_decode|mlpspeculator|eagle|speculation).*\.py
+- files~=^vllm/model_executor/models/.*eagle.*\.py
+- files=vllm/model_executor/models/mlp_speculator.py
+- files~=^vllm/transformers_utils/configs/(eagle|medusa|mlp_speculator)\.py
 actions:
 label:
 add:
@@ -126,6 +221,26 @@ pull_request_rules:
 remove:
 - tpu

+- name: label-tool-calling
+description: Automatically add tool-calling label
+conditions:
+- or:
+- files~=^tests/tool_use/
+- files~=^tests/mistral_tool_use/
+- files~=^tests/entrypoints/openai/tool_parsers/
+- files=tests/entrypoints/openai/test_chat_with_tool_reasoning.py
+- files~=^vllm/entrypoints/openai/tool_parsers/
+- files=docs/features/tool_calling.md
+- files~=^examples/tool_chat_*
+- files=examples/offline_inference/chat_with_tools.py
+- files=examples/online_serving/openai_chat_completion_client_with_tools_required.py
+- files=examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py
+- files=examples/online_serving/openai_chat_completion_client_with_tools.py
+actions:
+label:
+add:
+- tool-calling
+
 - name: ping author on conflicts and add 'needs-rebase' label
 conditions:
 - conflict
@@ -141,6 +256,17 @@ pull_request_rules:

 https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/syncing-a-fork

+- name: assign reviewer for tensorizer changes
+conditions:
+- files~=^vllm/model_executor/model_loader/tensorizer.py
+- files~=^vllm/model_executor/model_loader/tensorizer_loader.py
+- files~=^tests/entrypoints/openai/test_tensorizer_entrypoint.py
+- files~=^tests/tensorizer_loader/
+actions:
+assign:
+users:
+- "sangstar"
+
 - name: remove 'needs-rebase' label when conflict is resolved
 conditions:
 - -conflict
```
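The `files~=` conditions in the rules above are regular expressions evaluated against the paths a pull request touches, and `title~=(?i)...` is a case-insensitive regex on the PR title. Below is a rough sketch of how one such rule could be checked locally; the patterns are copied from the `label-deepseek` rule, while the helper itself is a simplified stand-in, not Mergify's actual engine.

```python
# Simplified stand-in for a Mergify "or" rule: label a PR if any changed file
# or the PR title matches. File patterns copied from the label-deepseek rule.
import re

DEEPSEEK_FILE_PATTERNS = [
    r"^examples/.*deepseek.*\.py",
    r"^tests/.*deepseek.*\.py",
    r"^vllm/entrypoints/openai/tool_parsers/.*deepseek.*\.py",
    r"^vllm/model_executor/models/.*deepseek.*\.py",
    r"^vllm/reasoning/.*deepseek.*\.py",
    r"^vllm/transformers_utils/.*deepseek.*\.py",
]
TITLE_PATTERN = re.compile(r"deepseek", re.IGNORECASE)  # mirrors title~=(?i)DeepSeek

def should_label_deepseek(changed_files: list[str], title: str) -> bool:
    if TITLE_PATTERN.search(title):
        return True
    return any(
        re.search(pattern, path)
        for pattern in DEEPSEEK_FILE_PATTERNS
        for path in changed_files
    )

if __name__ == "__main__":
    print(should_label_deepseek(
        ["vllm/model_executor/models/deepseek_v2.py"], "Fix MLA prefill"))  # True, file match
    print(should_label_deepseek(["README.md"], "Support DeepSeek-R1"))      # True, title match
```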
.github/scripts/cleanup_pr_body.sh (2 changes, vendored)

```diff
@@ -26,7 +26,7 @@ sed -i '/\*\*BEFORE SUBMITTING, PLEASE READ.*\*\*/,$d' "${NEW}"

 # Remove HTML <details> section that includes <summary> text of "PR Checklist (Click to Expand)"
 python3 - <<EOF
-import re
+import regex as re

 with open("${NEW}", "r") as file:
 content = file.read()
```
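The only change here swaps the standard-library `re` for the third-party `regex` package (installed by the workflow change that follows), presumably because `regex` supports features `re` lacks, such as recursive patterns that can consume properly nested `<details>` blocks. The snippet below is my own illustration of that capability with a made-up pattern, not the pattern the script actually uses.

```python
# Illustration of a `regex`-only feature: recursive patterns ((?1) recurses
# into group 1), which can strip nested <details> blocks in one pass.
# Assumed motivation and made-up pattern, not cleanup_pr_body.sh's code.
import regex

html = """<details>
<summary>PR Checklist (Click to Expand)</summary>
<details><summary>inner</summary>nested content</details>
</details>
Keep this line."""

# (?s) lets '.' cross newlines; the alternation prefers recursing into a
# nested <details> block over consuming a single character.
pattern = regex.compile(r"(?s)(<details>(?:(?1)|.)*?</details>)")

print(pattern.sub("", html).strip())  # -> Keep this line.
```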
.github/workflows/add_label_automerge.yml (2 changes, vendored)

```diff
@@ -1,4 +1,6 @@
 name: Add label on auto-merge enabled
+permissions:
+pull-requests: write
 on:
 pull_request_target:
 types:
```
.github/workflows/cleanup_pr_body.yml (7 changes, vendored)

```diff
@@ -20,7 +20,12 @@ jobs:
 with:
 python-version: '3.12'

+- name: Install Python dependencies
+run: |
+python3 -m pip install --upgrade pip
+python3 -m pip install regex
+
 - name: Update PR description
 env:
 GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-run: .github/scripts/cleanup_pr_body.sh "${{ github.event.number }}"
+run: bash .github/scripts/cleanup_pr_body.sh "${{ github.event.number }}"
```
.github/workflows/lint-and-deploy.yaml (9 changes, vendored)

```diff
@@ -2,6 +2,9 @@ name: Lint and Deploy Charts

 on: pull_request

+permissions:
+contents: read
+
 jobs:
 lint-and-deploy:
 runs-on: ubuntu-latest
@@ -65,8 +68,8 @@ jobs:
 export AWS_ACCESS_KEY_ID=minioadmin
 export AWS_SECRET_ACCESS_KEY=minioadmin
 sleep 30 && kubectl -n ns-vllm logs -f "$(kubectl -n ns-vllm get pods | awk '/deployment/ {print $1;exit}')" &
-helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/online_serving/chart-helm -f examples/online_serving/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env"
+helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/online_serving/chart-helm -f examples/online_serving/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set image.env[2].name=VLLM_CPU_CI_ENV --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string image.env[2].value="1" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env"

 - name: curl test
 run: |
 kubectl -n ns-vllm port-forward service/test-vllm-service 8001:80 &
@@ -79,4 +82,4 @@ jobs:
 "max_tokens": 7,
 "temperature": 0
 }'):$CODE"
 echo "$CODE"
```
.github/workflows/pre-commit.yml (3 changes, vendored)

```diff
@@ -5,6 +5,9 @@ on:
 push:
 branches: [main]

+permissions:
+contents: read
+
 jobs:
 pre-commit:
 runs-on: ubuntu-latest
```
.github/workflows/reminder_comment.yml (2 changes, vendored)

```diff
@@ -1,4 +1,6 @@
 name: PR Reminder Comment Bot
+permissions:
+pull-requests: write
 on:
 pull_request_target:
 types: [opened]
```
.gitignore (8 changes, vendored)

```diff
@@ -3,7 +3,6 @@

 # vllm-flash-attn built from source
 vllm/vllm_flash_attn/*
-!vllm/vllm_flash_attn/fa_utils.py

 # Byte-compiled / optimized / DLL files
 __pycache__/
@@ -78,10 +77,6 @@ instance/
 # Scrapy stuff:
 .scrapy

-# Sphinx documentation
-docs/_build/
-docs/source/getting_started/examples/
-
 # PyBuilder
 .pybuilder/
 target/
@@ -151,6 +146,7 @@ venv.bak/

 # mkdocs documentation
 /site
+docs/examples

 # mypy
 .mypy_cache/
@@ -204,5 +200,5 @@ benchmarks/**/*.json
 actionlint
 shellcheck*/

-# Ingore moe/marlin_moe gen code
+# Ignore moe/marlin_moe gen code
 csrc/moe/marlin_moe_wna16/kernel_*
```
```diff
@@ -11,51 +11,59 @@ repos:
 hooks:
 - id: yapf
 args: [--in-place, --verbose]
+# Keep the same list from yapfignore here to avoid yapf failing without any inputs
+exclude: '(.buildkite|benchmarks|build|examples)/.*'
 - repo: https://github.com/astral-sh/ruff-pre-commit
-rev: v0.9.3
+rev: v0.11.7
 hooks:
 - id: ruff
 args: [--output-format, github, --fix]
-- repo: https://github.com/codespell-project/codespell
+- id: ruff-format
-rev: v2.4.0
+files: ^(.buildkite|benchmarks|examples)/.*
+- repo: https://github.com/crate-ci/typos
+rev: v1.32.0
 hooks:
-- id: codespell
+- id: typos
-additional_dependencies: ['tomli']
-args: ['--toml', 'pyproject.toml']
 - repo: https://github.com/PyCQA/isort
-rev: 0a0b7a830386ba6a31c2ec8316849ae4d1b8240d # 6.0.0
+rev: 6.0.1
 hooks:
 - id: isort
 - repo: https://github.com/pre-commit/mirrors-clang-format
-rev: v19.1.7
+rev: v20.1.3
 hooks:
 - id: clang-format
 exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))|vllm/third_party/.*'
 types_or: [c++, cuda]
 args: [--style=file, --verbose]
 - repo: https://github.com/jackdewinter/pymarkdown
-rev: v0.9.27
+rev: v0.9.29
 hooks:
 - id: pymarkdown
+exclude: '.*\.inc\.md'
 args: [fix]
 - repo: https://github.com/rhysd/actionlint
 rev: v1.7.7
 hooks:
 - id: actionlint
 - repo: https://github.com/astral-sh/uv-pre-commit
-rev: 0.6.2
+rev: 0.6.17
 hooks:
 - id: pip-compile
-args: [requirements/test.in, -o, requirements/test.txt]
+args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu128]
 files: ^requirements/test\.(in|txt)$
 - repo: local
 hooks:
+- id: format-torch-nightly-test
+name: reformat nightly_torch_test.txt to be in sync with test.in
+language: python
+entry: python tools/generate_nightly_torch_test.py
+files: ^requirements/test\.(in|txt)$
 - id: mypy-local
 name: Run mypy for local Python installation
 entry: tools/mypy.sh 0 "local"
 language: python
 types: [python]
-additional_dependencies: &mypy_deps [mypy==1.11.1, types-cachetools, types-setuptools, types-PyYAML, types-requests]
+additional_dependencies: &mypy_deps [mypy==1.11.1, types-cachetools, types-setuptools, types-PyYAML, types-requests, pydantic]
 stages: [pre-commit] # Don't run in CI
 - id: mypy-3.9 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
 name: Run mypy for Python 3.9
@@ -101,8 +109,8 @@ repos:
 args:
 - -c
 - |
-if ! grep -q "^Signed-off-by: $(git config user.name) <$(git config user.email)>" .git/COMMIT_EDITMSG; then
+if ! grep -q "^Signed-off-by: $(git config user.name) <$(git config user.email)>" "$(git rev-parse --git-path COMMIT_EDITMSG)"; then
-printf "\nSigned-off-by: $(git config user.name) <$(git config user.email)>\n" >> .git/COMMIT_EDITMSG
+printf "\nSigned-off-by: $(git config user.name) <$(git config user.email)>\n" >> "$(git rev-parse --git-path COMMIT_EDITMSG)"
 fi
 language: system
 verbose: true
@@ -112,6 +120,11 @@ repos:
 entry: python tools/check_spdx_header.py
 language: python
 types: [python]
+- id: check-root-lazy-imports
+name: Check root lazy imports
+entry: python tools/check_init_lazy_imports.py
+language: python
+types: [python]
 - id: check-filenames
 name: Check for spaces in all filenames
 entry: bash
@@ -125,12 +138,39 @@ repos:
 name: Update Dockerfile dependency graph
 entry: tools/update-dockerfile-graph.sh
 language: script
-files: ^docker/Dockerfile$
+- id: enforce-import-regex-instead-of-re
+name: Enforce import regex as re
+entry: python tools/enforce_regex_import.py
+language: python
+types: [python]
 pass_filenames: false
+additional_dependencies: [regex]
+# forbid directly import triton
+- id: forbid-direct-triton-import
+name: "Forbid direct 'import triton'"
+entry: python tools/check_triton_import.py
+language: python
+types: [python]
+pass_filenames: false
+additional_dependencies: [regex]
+- id: check-pickle-imports
+name: Prevent new pickle/cloudpickle imports
+entry: python tools/check_pickle_imports.py
+language: python
+types: [python]
+pass_filenames: false
+additional_dependencies: [pathspec, regex]
+- id: validate-config
+name: Validate configuration has default values and that each field has a docstring
+entry: python tools/validate_config.py
+language: python
+types: [python]
+pass_filenames: true
+files: vllm/config.py|tests/test_config.py
 # Keep `suggestion` last
 - id: suggestion
 name: Suggestion
-entry: bash -c 'echo "To bypass pre-commit hooks, add --no-verify to git commit."'
+entry: bash -c 'echo "To bypass all the pre-commit hooks, add --no-verify to git commit. To skip a specific hook, prefix the commit command with SKIP=<hook-id>."'
 language: system
 verbose: true
 pass_filenames: false
```
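One notable change in the local hooks above is resolving the commit-message file with `git rev-parse --git-path COMMIT_EDITMSG` instead of hard-coding `.git/COMMIT_EDITMSG`; `--git-path` returns the right location even when `.git` is a file pointing elsewhere, as it is in linked worktrees and submodules, which is presumably why the hook was changed. Below is a small Python rendering of the same sign-off logic; it is my own sketch of the hook's shell commands, not the hook itself, and it assumes `git` is on PATH and that it runs inside a repository during a commit (when COMMIT_EDITMSG exists).

```python
# Sketch of the DCO sign-off hook above, resolving COMMIT_EDITMSG via
# `git rev-parse --git-path` so it also works in worktrees and submodules.
import subprocess

def git_config(key: str) -> str:
    return subprocess.run(
        ["git", "config", key], check=True, capture_output=True, text=True
    ).stdout.strip()

def commit_editmsg_path() -> str:
    # `--git-path` resolves the real location of COMMIT_EDITMSG, even when
    # `.git` is a file rather than a directory.
    return subprocess.run(
        ["git", "rev-parse", "--git-path", "COMMIT_EDITMSG"],
        check=True, capture_output=True, text=True,
    ).stdout.strip()

def append_signoff() -> None:
    signoff = f"Signed-off-by: {git_config('user.name')} <{git_config('user.email')}>"
    # Only valid mid-commit, when COMMIT_EDITMSG has been written by git.
    with open(commit_editmsg_path(), "r+", encoding="utf-8") as f:
        if not any(line.startswith(signoff) for line in f.read().splitlines()):
            f.write(f"\n{signoff}\n")

if __name__ == "__main__":
    append_signoff()
```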
```diff
@@ -8,12 +8,8 @@ build:
 tools:
 python: "3.12"

-sphinx:
+mkdocs:
-configuration: docs/source/conf.py
+configuration: mkdocs.yaml
-fail_on_warning: true

-# If using Sphinx, optionally build your docs in additional formats such as PDF
-formats: []
-
 # Optionally declare the Python requirements required to build your docs
 python:
```
234
CMakeLists.txt
234
CMakeLists.txt
@ -15,7 +15,6 @@ project(vllm_extensions LANGUAGES CXX)
|
|||||||
|
|
||||||
# CUDA by default, can be overridden by using -DVLLM_TARGET_DEVICE=... (used by setup.py)
|
# CUDA by default, can be overridden by using -DVLLM_TARGET_DEVICE=... (used by setup.py)
|
||||||
set(VLLM_TARGET_DEVICE "cuda" CACHE STRING "Target device backend for vLLM")
|
set(VLLM_TARGET_DEVICE "cuda" CACHE STRING "Target device backend for vLLM")
|
||||||
|
|
||||||
message(STATUS "Build type: ${CMAKE_BUILD_TYPE}")
|
message(STATUS "Build type: ${CMAKE_BUILD_TYPE}")
|
||||||
message(STATUS "Target device: ${VLLM_TARGET_DEVICE}")
|
message(STATUS "Target device: ${VLLM_TARGET_DEVICE}")
|
||||||
|
|
||||||
@ -24,15 +23,15 @@ include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake)
|
|||||||
# Suppress potential warnings about unused manually-specified variables
|
# Suppress potential warnings about unused manually-specified variables
|
||||||
set(ignoreMe "${VLLM_PYTHON_PATH}")
|
set(ignoreMe "${VLLM_PYTHON_PATH}")
|
||||||
|
|
||||||
|
# Prevent installation of dependencies (cutlass) by default.
|
||||||
|
install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
|
||||||
|
|
||||||
#
|
#
|
||||||
# Supported python versions. These versions will be searched in order, the
|
# Supported python versions. These versions will be searched in order, the
|
||||||
# first match will be selected. These should be kept in sync with setup.py.
|
# first match will be selected. These should be kept in sync with setup.py.
|
||||||
#
|
#
|
||||||
set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12")
|
set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12")
|
||||||
|
|
||||||
# Supported NVIDIA architectures.
|
|
||||||
set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0")
|
|
||||||
|
|
||||||
# Supported AMD GPU architectures.
|
# Supported AMD GPU architectures.
|
||||||
set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201")
|
set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201")
|
||||||
|
|
||||||
@ -46,8 +45,8 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1
|
|||||||
# requirements.txt files and should be kept consistent. The ROCm torch
|
# requirements.txt files and should be kept consistent. The ROCm torch
|
||||||
# versions are derived from docker/Dockerfile.rocm
|
# versions are derived from docker/Dockerfile.rocm
|
||||||
#
|
#
|
||||||
set(TORCH_SUPPORTED_VERSION_CUDA "2.6.0")
|
set(TORCH_SUPPORTED_VERSION_CUDA "2.7.0")
|
||||||
set(TORCH_SUPPORTED_VERSION_ROCM "2.6.0")
|
set(TORCH_SUPPORTED_VERSION_ROCM "2.7.0")
|
||||||
|
|
||||||
#
|
#
|
||||||
# Try to find python package with an executable that exactly matches
|
# Try to find python package with an executable that exactly matches
|
||||||
@ -80,6 +79,15 @@ endif()
|
|||||||
#
|
#
|
||||||
find_package(Torch REQUIRED)
|
find_package(Torch REQUIRED)
|
||||||
|
|
||||||
|
# Supported NVIDIA architectures.
|
||||||
|
# This check must happen after find_package(Torch) because that's when CMAKE_CUDA_COMPILER_VERSION gets defined
|
||||||
|
if(DEFINED CMAKE_CUDA_COMPILER_VERSION AND
|
||||||
|
CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8)
|
||||||
|
set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0")
|
||||||
|
else()
|
||||||
|
set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0")
|
||||||
|
endif()
|
||||||
|
|
||||||
#
|
#
|
||||||
# Forward the non-CUDA device extensions to external CMake scripts.
|
# Forward the non-CUDA device extensions to external CMake scripts.
|
||||||
#
|
#
|
||||||
@ -174,9 +182,6 @@ include(FetchContent)
|
|||||||
file(MAKE_DIRECTORY ${FETCHCONTENT_BASE_DIR}) # Ensure the directory exists
|
file(MAKE_DIRECTORY ${FETCHCONTENT_BASE_DIR}) # Ensure the directory exists
|
||||||
message(STATUS "FetchContent base directory: ${FETCHCONTENT_BASE_DIR}")
|
message(STATUS "FetchContent base directory: ${FETCHCONTENT_BASE_DIR}")
|
||||||
|
|
||||||
#
|
|
||||||
# Set rocm version dev int.
|
|
||||||
#
|
|
||||||
if(VLLM_GPU_LANG STREQUAL "HIP")
|
if(VLLM_GPU_LANG STREQUAL "HIP")
|
||||||
#
|
#
|
||||||
# Overriding the default -O set up by cmake, adding ggdb3 for the most verbose devug info
|
# Overriding the default -O set up by cmake, adding ggdb3 for the most verbose devug info
|
||||||
@ -184,7 +189,6 @@ if(VLLM_GPU_LANG STREQUAL "HIP")
|
|||||||
set(CMAKE_${VLLM_GPU_LANG}_FLAGS_DEBUG "${CMAKE_${VLLM_GPU_LANG}_FLAGS_DEBUG} -O0 -ggdb3")
|
set(CMAKE_${VLLM_GPU_LANG}_FLAGS_DEBUG "${CMAKE_${VLLM_GPU_LANG}_FLAGS_DEBUG} -O0 -ggdb3")
|
||||||
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0 -ggdb3")
|
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0 -ggdb3")
|
||||||
|
|
||||||
|
|
||||||
#
|
#
|
||||||
# Certain HIP functions are marked as [[nodiscard]], yet vllm ignores the result which generates
|
# Certain HIP functions are marked as [[nodiscard]], yet vllm ignores the result which generates
|
||||||
# a lot of warnings that always mask real issues. Suppressing until this is properly addressed.
|
# a lot of warnings that always mask real issues. Suppressing until this is properly addressed.
|
||||||
@ -227,20 +231,25 @@ endif()
|
|||||||
#
|
#
|
||||||
|
|
||||||
set(VLLM_EXT_SRC
|
set(VLLM_EXT_SRC
|
||||||
|
"csrc/mamba/mamba_ssm/selective_scan_fwd.cu"
|
||||||
|
"csrc/mamba/causal_conv1d/causal_conv1d.cu"
|
||||||
"csrc/cache_kernels.cu"
|
"csrc/cache_kernels.cu"
|
||||||
"csrc/attention/paged_attention_v1.cu"
|
"csrc/attention/paged_attention_v1.cu"
|
||||||
"csrc/attention/paged_attention_v2.cu"
|
"csrc/attention/paged_attention_v2.cu"
|
||||||
"csrc/attention/merge_attn_states.cu"
|
"csrc/attention/merge_attn_states.cu"
|
||||||
|
"csrc/attention/vertical_slash_index.cu"
|
||||||
"csrc/pos_encoding_kernels.cu"
|
"csrc/pos_encoding_kernels.cu"
|
||||||
"csrc/activation_kernels.cu"
|
"csrc/activation_kernels.cu"
|
||||||
"csrc/layernorm_kernels.cu"
|
"csrc/layernorm_kernels.cu"
|
||||||
"csrc/layernorm_quant_kernels.cu"
|
"csrc/layernorm_quant_kernels.cu"
|
||||||
|
"csrc/sampler.cu"
|
||||||
"csrc/cuda_view.cu"
|
"csrc/cuda_view.cu"
|
||||||
"csrc/quantization/gptq/q_gemm.cu"
|
"csrc/quantization/gptq/q_gemm.cu"
|
||||||
"csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
|
"csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
|
||||||
"csrc/quantization/fp8/common.cu"
|
"csrc/quantization/fp8/common.cu"
|
||||||
"csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu"
|
"csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu"
|
||||||
"csrc/quantization/gguf/gguf_kernel.cu"
|
"csrc/quantization/gguf/gguf_kernel.cu"
|
||||||
|
"csrc/quantization/activation_kernels.cu"
|
||||||
"csrc/cuda_utils_kernels.cu"
|
"csrc/cuda_utils_kernels.cu"
|
||||||
"csrc/prepare_inputs/advance_step.cu"
|
"csrc/prepare_inputs/advance_step.cu"
|
||||||
"csrc/custom_all_reduce.cu"
|
"csrc/custom_all_reduce.cu"
|
||||||
@ -249,9 +258,8 @@ set(VLLM_EXT_SRC
|
|||||||
if(VLLM_GPU_LANG STREQUAL "CUDA")
|
if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||||
SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
|
SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
|
||||||
|
|
||||||
# Set CUTLASS_REVISION manually -- its revision detection doesn't work in this case.
|
# Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages when building.
|
||||||
# Please keep this in sync with FetchContent_Declare line below.
|
set(CUTLASS_REVISION "v4.0.0" CACHE STRING "CUTLASS revision to use")
|
||||||
set(CUTLASS_REVISION "v3.8.0" CACHE STRING "CUTLASS revision to use")
|
|
||||||
|
|
||||||
# Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
|
# Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
|
||||||
if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR})
|
if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR})
|
||||||
@ -269,7 +277,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
cutlass
|
cutlass
|
||||||
GIT_REPOSITORY https://github.com/nvidia/cutlass.git
|
GIT_REPOSITORY https://github.com/nvidia/cutlass.git
|
||||||
# Please keep this in sync with CUTLASS_REVISION line above.
|
# Please keep this in sync with CUTLASS_REVISION line above.
|
||||||
GIT_TAG v3.8.0
|
GIT_TAG ${CUTLASS_REVISION}
|
||||||
GIT_PROGRESS TRUE
|
GIT_PROGRESS TRUE
|
||||||
|
|
||||||
# Speed up CUTLASS download by retrieving only the specified GIT_TAG instead of the history.
|
# Speed up CUTLASS download by retrieving only the specified GIT_TAG instead of the history.
|
||||||
@ -281,16 +289,16 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
FetchContent_MakeAvailable(cutlass)
|
FetchContent_MakeAvailable(cutlass)
|
||||||
|
|
||||||
list(APPEND VLLM_EXT_SRC
|
list(APPEND VLLM_EXT_SRC
|
||||||
"csrc/mamba/mamba_ssm/selective_scan_fwd.cu"
|
|
||||||
"csrc/mamba/causal_conv1d/causal_conv1d.cu"
|
|
||||||
"csrc/quantization/aqlm/gemm_kernels.cu"
|
"csrc/quantization/aqlm/gemm_kernels.cu"
|
||||||
"csrc/quantization/awq/gemm_kernels.cu"
|
"csrc/quantization/awq/gemm_kernels.cu"
|
||||||
"csrc/permute_cols.cu"
|
"csrc/permute_cols.cu"
|
||||||
"csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
|
"csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
|
||||||
"csrc/quantization/fp4/nvfp4_quant_entry.cu"
|
"csrc/quantization/fp4/nvfp4_quant_entry.cu"
|
||||||
"csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu"
|
"csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu"
|
||||||
|
"csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu"
|
||||||
"csrc/sparse/cutlass/sparse_scaled_mm_entry.cu"
|
"csrc/sparse/cutlass/sparse_scaled_mm_entry.cu"
|
||||||
"csrc/cutlass_extensions/common.cpp")
|
"csrc/cutlass_extensions/common.cpp"
|
||||||
|
"csrc/attention/mla/cutlass_mla_entry.cu")
|
||||||
|
|
||||||
set_gencode_flags_for_srcs(
|
set_gencode_flags_for_srcs(
|
||||||
SRCS "${VLLM_EXT_SRC}"
|
SRCS "${VLLM_EXT_SRC}"
|
||||||
@ -299,10 +307,55 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
# Only build Marlin kernels if we are building for at least some compatible archs.
|
# Only build Marlin kernels if we are building for at least some compatible archs.
|
||||||
# Keep building Marlin for 9.0 as there are some group sizes and shapes that
|
# Keep building Marlin for 9.0 as there are some group sizes and shapes that
|
||||||
# are not supported by Machete yet.
|
# are not supported by Machete yet.
|
||||||
cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0" "${CUDA_ARCHS}")
|
# 9.0 for latest bf16 atomicAdd PTX
|
||||||
|
cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.7;9.0+PTX" "${CUDA_ARCHS}")
|
||||||
if (MARLIN_ARCHS)
|
if (MARLIN_ARCHS)
|
||||||
|
|
||||||
|
#
|
||||||
|
# For the Marlin kernels we automatically generate sources for various
|
||||||
|
# preselected input type pairs and schedules.
|
||||||
|
# Generate sources:
|
||||||
|
set(MARLIN_GEN_SCRIPT
|
||||||
|
${CMAKE_CURRENT_SOURCE_DIR}/csrc/quantization/gptq_marlin/generate_kernels.py)
|
||||||
|
file(MD5 ${MARLIN_GEN_SCRIPT} MARLIN_GEN_SCRIPT_HASH)
|
||||||
|
|
||||||
|
message(STATUS "Marlin generation script hash: ${MARLIN_GEN_SCRIPT_HASH}")
|
||||||
|
message(STATUS "Last run Marlin generate script hash: $CACHE{MARLIN_GEN_SCRIPT_HASH}")
|
||||||
|
|
||||||
|
if (NOT DEFINED CACHE{MARLIN_GEN_SCRIPT_HASH}
|
||||||
|
OR NOT $CACHE{MARLIN_GEN_SCRIPT_HASH} STREQUAL ${MARLIN_GEN_SCRIPT_HASH})
|
||||||
|
execute_process(
|
||||||
|
COMMAND ${CMAKE_COMMAND} -E env
|
||||||
|
PYTHONPATH=$PYTHONPATH
|
||||||
|
${Python_EXECUTABLE} ${MARLIN_GEN_SCRIPT}
|
||||||
|
RESULT_VARIABLE marlin_generation_result
|
||||||
|
OUTPUT_VARIABLE marlin_generation_result
|
||||||
|
OUTPUT_FILE ${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log
|
||||||
|
ERROR_FILE ${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log
|
||||||
|
)
|
||||||
|
|
||||||
|
if (NOT marlin_generation_result EQUAL 0)
|
||||||
|
message(FATAL_ERROR "Marlin generation failed."
|
||||||
|
" Result: \"${marlin_generation_result}\""
|
||||||
|
"\nCheck the log for details: "
|
||||||
|
"${CMAKE_CURRENT_BINARY_DIR}/marlin_generation.log")
|
||||||
|
else()
|
||||||
|
set(MARLIN_GEN_SCRIPT_HASH ${MARLIN_GEN_SCRIPT_HASH}
|
||||||
|
CACHE STRING "Last run Marlin generate script hash" FORCE)
|
||||||
|
message(STATUS "Marlin generation completed successfully.")
|
||||||
|
endif()
|
||||||
|
else()
|
||||||
|
message(STATUS "Marlin generation script has not changed, skipping generation.")
|
||||||
|
endif()
|
||||||
|
|
||||||
|
file(GLOB MARLIN_TEMPLATE_KERNEL_SRC "csrc/quantization/gptq_marlin/kernel_*.cu")
|
||||||
|
set_gencode_flags_for_srcs(
|
||||||
|
SRCS "${MARLIN_TEMPLATE_KERNEL_SRC}"
|
||||||
|
CUDA_ARCHS "${MARLIN_ARCHS}")
|
||||||
|
|
||||||
|
list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_KERNEL_SRC})
|
||||||
|
|
||||||
set(MARLIN_SRCS
|
set(MARLIN_SRCS
|
||||||
"csrc/quantization/fp8/fp8_marlin.cu"
|
|
||||||
"csrc/quantization/marlin/dense/marlin_cuda_kernel.cu"
|
"csrc/quantization/marlin/dense/marlin_cuda_kernel.cu"
|
||||||
"csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
|
"csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
|
||||||
"csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu"
|
"csrc/quantization/marlin/qqq/marlin_qqq_gemm_kernel.cu"
|
||||||
@ -367,13 +420,44 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
endif()
|
endif()
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
# The cutlass_scaled_mm kernels for Blackwell (c3x, i.e. CUTLASS 3.x) require
|
|
||||||
|
# The cutlass_scaled_mm kernels for Geforce Blackwell SM120 (c3x, i.e. CUTLASS 3.x) require
|
||||||
# CUDA 12.8 or later
|
# CUDA 12.8 or later
|
||||||
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;12.0a" "${CUDA_ARCHS}")
|
cuda_archs_loose_intersection(SCALED_MM_ARCHS "12.0;12.0a" "${CUDA_ARCHS}")
|
||||||
|
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND SCALED_MM_ARCHS)
|
||||||
|
set(SRCS
|
||||||
|
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm120.cu"
|
||||||
|
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm120_fp8.cu"
|
||||||
|
)
|
||||||
|
set_gencode_flags_for_srcs(
|
||||||
|
SRCS "${SRCS}"
|
||||||
|
CUDA_ARCHS "${SCALED_MM_ARCHS}")
|
||||||
|
list(APPEND VLLM_EXT_SRC "${SRCS}")
|
||||||
|
list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_SM120=1")
|
||||||
|
# Let scaled_mm_c2x know it doesn't need to build these arches
|
||||||
|
list(APPEND SCALED_MM_3X_ARCHS "${SCALED_MM_ARCHS}")
|
||||||
|
message(STATUS "Building scaled_mm_c3x_sm120 for archs: ${SCALED_MM_ARCHS}")
|
||||||
|
else()
|
||||||
|
if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND SCALED_MM_ARCHS)
|
||||||
|
message(STATUS "Not building scaled_mm_c3x_sm120 as CUDA Compiler version is "
|
||||||
|
"not >= 12.8, we recommend upgrading to CUDA 12.8 or "
|
||||||
|
"later if you intend on running FP8 quantized models on "
|
||||||
|
"Blackwell.")
|
||||||
|
else()
|
||||||
|
message(STATUS "Not building scaled_mm_c3x_120 as no compatible archs found "
|
||||||
|
"in CUDA target architectures")
|
||||||
|
endif()
|
||||||
|
endif()
|
||||||
|
|
||||||
|
|
||||||
|
# The cutlass_scaled_mm kernels for Blackwell SM100 (c3x, i.e. CUTLASS 3.x)
|
||||||
|
# require CUDA 12.8 or later
|
||||||
|
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a" "${CUDA_ARCHS}")
|
||||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND SCALED_MM_ARCHS)
|
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND SCALED_MM_ARCHS)
|
||||||
set(SRCS
|
set(SRCS
|
||||||
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu"
|
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu"
|
||||||
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu"
|
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu"
|
||||||
|
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8.cu"
|
||||||
)
|
)
|
||||||
set_gencode_flags_for_srcs(
|
set_gencode_flags_for_srcs(
|
||||||
SRCS "${SRCS}"
|
SRCS "${SRCS}"
|
||||||
@ -398,8 +482,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
#
|
#
|
||||||
# For the cutlass_scaled_mm kernels we want to build the c2x (CUTLASS 2.x)
|
# For the cutlass_scaled_mm kernels we want to build the c2x (CUTLASS 2.x)
|
||||||
# kernels for the remaining archs that are not already built for 3x.
|
# kernels for the remaining archs that are not already built for 3x.
|
||||||
|
# (Build 8.9 for FP8)
|
||||||
cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS
|
cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS
|
||||||
"7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0" "${CUDA_ARCHS}")
|
"7.5;8.0;8.7;8.9+PTX" "${CUDA_ARCHS}")
|
||||||
# subtract out the archs that are already built for 3x
|
# subtract out the archs that are already built for 3x
|
||||||
list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
|
list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
|
||||||
if (SCALED_MM_2X_ARCHS)
|
if (SCALED_MM_2X_ARCHS)
|
||||||
@ -450,12 +535,15 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND FP4_ARCHS)
|
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND FP4_ARCHS)
|
||||||
set(SRCS
|
set(SRCS
|
||||||
"csrc/quantization/fp4/nvfp4_quant_kernels.cu"
|
"csrc/quantization/fp4/nvfp4_quant_kernels.cu"
|
||||||
"csrc/quantization/fp4/nvfp4_scaled_mm_kernels.cu")
|
"csrc/quantization/fp4/nvfp4_experts_quant.cu"
|
||||||
|
"csrc/quantization/fp4/nvfp4_scaled_mm_kernels.cu"
|
||||||
|
"csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu")
|
||||||
set_gencode_flags_for_srcs(
|
set_gencode_flags_for_srcs(
|
||||||
SRCS "${SRCS}"
|
SRCS "${SRCS}"
|
||||||
CUDA_ARCHS "${FP4_ARCHS}")
|
CUDA_ARCHS "${FP4_ARCHS}")
|
||||||
list(APPEND VLLM_EXT_SRC "${SRCS}")
|
list(APPEND VLLM_EXT_SRC "${SRCS}")
|
||||||
list(APPEND VLLM_GPU_FLAGS "-DENABLE_NVFP4=1")
|
list(APPEND VLLM_GPU_FLAGS "-DENABLE_NVFP4=1")
|
||||||
|
list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MOE_SM100=1")
|
||||||
message(STATUS "Building NVFP4 for archs: ${FP4_ARCHS}")
|
message(STATUS "Building NVFP4 for archs: ${FP4_ARCHS}")
|
||||||
else()
|
else()
|
||||||
message(STATUS "Not building NVFP4 as no compatible archs were found.")
|
message(STATUS "Not building NVFP4 as no compatible archs were found.")
|
||||||
@ -463,16 +551,34 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
set(FP4_ARCHS)
|
set(FP4_ARCHS)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
#
|
# CUTLASS MLA Archs and flags
|
||||||
|
cuda_archs_loose_intersection(MLA_ARCHS "10.0a" "${CUDA_ARCHS}")
|
||||||
|
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND MLA_ARCHS)
|
||||||
|
set(SRCS
|
||||||
|
"csrc/attention/mla/cutlass_mla_kernels.cu")
|
||||||
|
set_gencode_flags_for_srcs(
|
||||||
|
SRCS "${SRCS}"
|
||||||
|
CUDA_ARCHS "${MLA_ARCHS}")
|
||||||
|
list(APPEND VLLM_EXT_SRC "${SRCS}")
|
||||||
|
list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MLA=1")
|
||||||
|
# Add MLA-specific include directories only to MLA source files
|
||||||
|
set_source_files_properties(${SRCS}
|
||||||
|
PROPERTIES INCLUDE_DIRECTORIES "${CUTLASS_DIR}/examples/77_blackwell_fmha;${CUTLASS_DIR}/examples/common")
|
||||||
|
message(STATUS "Building CUTLASS MLA for archs: ${MLA_ARCHS}")
|
||||||
|
else()
|
||||||
|
message(STATUS "Not building CUTLASS MLA as no compatible archs were found.")
|
||||||
|
# clear MLA_ARCHS
|
||||||
|
set(MLA_ARCHS)
|
||||||
|
endif()
|
||||||
|
|
||||||
# CUTLASS MoE kernels
|
# CUTLASS MoE kernels
|
||||||
|
|
||||||
# The MoE kernel cutlass_moe_mm requires CUDA 12.3 or later (and only works
|
# The MoE kernel cutlass_moe_mm requires CUDA 12.3 or later (and ONLY works
|
||||||
# on Hopper). get_cutlass_moe_mm_data should only be compiled if it's possible
|
# on Hopper). get_cutlass_(pplx_)moe_mm_data should only be compiled
|
||||||
# to compile MoE kernels that use its output.
|
# if it's possible to compile MoE kernels that use its output.
|
||||||
cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}")
|
cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a" "${CUDA_ARCHS}")
|
||||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
|
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
|
||||||
set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cu"
|
set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cu")
|
||||||
"csrc/quantization/cutlass_w8a8/moe/moe_data.cu")
|
|
||||||
set_gencode_flags_for_srcs(
|
set_gencode_flags_for_srcs(
|
||||||
SRCS "${SRCS}"
|
SRCS "${SRCS}"
|
||||||
CUDA_ARCHS "${SCALED_MM_ARCHS}")
|
CUDA_ARCHS "${SCALED_MM_ARCHS}")
|
||||||
@ -486,6 +592,46 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
|||||||
"if you intend on running FP8 quantized MoE models on Hopper.")
|
"if you intend on running FP8 quantized MoE models on Hopper.")
|
||||||
else()
|
else()
|
||||||
message(STATUS "Not building grouped_mm_c3x as no compatible archs found "
|
message(STATUS "Not building grouped_mm_c3x as no compatible archs found "
|
||||||
|
"in CUDA target architectures.")
|
||||||
|
endif()
|
||||||
|
endif()
|
||||||
|
|
||||||
|
# moe_data.cu is used by all CUTLASS MoE kernels.
|
||||||
|
cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0a" "${CUDA_ARCHS}")
|
||||||
|
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND CUTLASS_MOE_DATA_ARCHS)
|
||||||
|
set(SRCS "csrc/quantization/cutlass_w8a8/moe/moe_data.cu")
|
||||||
|
set_gencode_flags_for_srcs(
|
||||||
|
SRCS "${SRCS}"
|
||||||
|
CUDA_ARCHS "${CUTLASS_MOE_DATA_ARCHS}")
|
||||||
|
list(APPEND VLLM_EXT_SRC "${SRCS}")
|
||||||
|
message(STATUS "Building moe_data for archs: ${CUTLASS_MOE_DATA_ARCHS}")
|
||||||
|
else()
|
||||||
|
if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND CUTLASS_MOE_DATA_ARCHS)
|
||||||
|
message(STATUS "Not building moe_data as CUDA Compiler version is "
|
||||||
|
"not >= 12.3, we recommend upgrading to CUDA 12.3 or later "
|
||||||
|
"if you intend on running FP8 quantized MoE models on Hopper or Blackwell.")
|
||||||
|
else()
|
||||||
|
message(STATUS "Not building moe_data as no compatible archs found "
|
||||||
|
"in CUDA target architectures.")
|
||||||
|
endif()
|
||||||
|
endif()
|
||||||
|
|
||||||
|
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
|
||||||
|
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
|
||||||
|
set(SRCS "csrc/quantization/cutlass_w8a8/moe/blockwise_scaled_group_mm_sm100.cu")
|
||||||
|
set_gencode_flags_for_srcs(
|
||||||
|
SRCS "${SRCS}"
|
||||||
|
CUDA_ARCHS "${SCALED_MM_ARCHS}")
|
||||||
|
list(APPEND VLLM_EXT_SRC "${SRCS}")
|
||||||
|
list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MOE_SM100=1")
|
||||||
|
message(STATUS "Building blockwise_scaled_group_mm_sm100 for archs: ${SCALED_MM_ARCHS}")
|
||||||
|
else()
|
||||||
|
if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
|
||||||
|
message(STATUS "Not building blockwise_scaled_group_mm_sm100 kernels as CUDA Compiler version is "
|
||||||
|
"not >= 12.8, we recommend upgrading to CUDA 12.8 or later "
|
||||||
|
"if you intend on running FP8 quantized MoE models on Blackwell.")
|
||||||
|
else()
|
||||||
|
message(STATUS "Not building blockwise_scaled_group_mm_sm100 as no compatible archs found "
|
||||||
"in CUDA target architectures")
|
"in CUDA target architectures")
|
||||||
endif()
|
endif()
|
||||||
endif()
|
endif()
|
||||||
@@ -562,6 +708,14 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 # if CUDA endif
 endif()

+if (VLLM_GPU_LANG STREQUAL "HIP")
+  # Add QuickReduce kernels
+  list(APPEND VLLM_EXT_SRC
+    "csrc/custom_quickreduce.cu"
+  )
+# if ROCM endif
+endif()
+
 message(STATUS "Enabling C extension.")
 define_gpu_extension_target(
   _C
@@ -607,7 +761,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     CUDA_ARCHS "${CUDA_ARCHS}")

   list(APPEND VLLM_MOE_EXT_SRC "${VLLM_MOE_WNA16_SRC}")
-  cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0" "${CUDA_ARCHS}")
+  # 9.0 for latest bf16 atomicAdd PTX
+  cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.7;9.0+PTX" "${CUDA_ARCHS}")
   if (MARLIN_MOE_ARCHS)

     #
@@ -625,7 +780,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
         OR NOT $CACHE{MOE_MARLIN_GEN_SCRIPT_HASH} STREQUAL ${MOE_MARLIN_GEN_SCRIPT_HASH})
       execute_process(
         COMMAND ${CMAKE_COMMAND} -E env
-        PYTHONPATH=${CMAKE_CURRENT_SOURCE_DIR}/csrc/cutlass_extensions/:${CUTLASS_DIR}/python/:${VLLM_PYTHON_PATH}:$PYTHONPATH
+        PYTHONPATH=$PYTHONPATH
         ${Python_EXECUTABLE} ${MOE_MARLIN_GEN_SCRIPT}
         RESULT_VARIABLE moe_marlin_generation_result
         OUTPUT_VARIABLE moe_marlin_generation_output
@@ -661,6 +816,17 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   endif()
 endif()

+if(VLLM_GPU_LANG STREQUAL "CUDA")
+  set(MOE_PERMUTE_SRC
+    "csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.cu"
+    "csrc/moe/moe_permute_unpermute_op.cu")
+
+  set_gencode_flags_for_srcs(
+    SRCS "${MARLIN_PERMUTE_SRC}"
+    CUDA_ARCHS "${MOE_PERMUTE_ARCHS}")
+
+  list(APPEND VLLM_MOE_EXT_SRC "${MOE_PERMUTE_SRC}")
+endif()
 message(STATUS "Enabling moe extension.")
 define_gpu_extension_target(
   _moe_C
@@ -669,6 +835,8 @@ define_gpu_extension_target(
   SOURCES ${VLLM_MOE_EXT_SRC}
   COMPILE_FLAGS ${VLLM_GPU_FLAGS}
   ARCHITECTURES ${VLLM_GPU_ARCHES}
+  INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR}
+  INCLUDE_DIRECTORIES ${CUTLASS_TOOLS_UTIL_INCLUDE_DIR}
   USE_SABI 3
   WITH_SOABI)

@@ -695,5 +863,7 @@ endif()
 # For CUDA we also build and ship some external projects.
 if (VLLM_GPU_LANG STREQUAL "CUDA")
   include(cmake/external_projects/flashmla.cmake)
+
+  # vllm-flash-attn should be last as it overwrites some CMake functions
   include(cmake/external_projects/vllm_flash_attn.cmake)
 endif ()
@@ -1,3 +1,3 @@
 # Contributing to vLLM

-You may find information about contributing to vLLM on [docs.vllm.ai](https://docs.vllm.ai/en/latest/contributing/overview.html).
+You may find information about contributing to vLLM on [docs.vllm.ai](https://docs.vllm.ai/en/latest/contributing).
README.md (38 changes)
@@ -1,7 +1,7 @@
 <p align="center">
   <picture>
-    <source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/vllm-project/vllm/main/docs/source/assets/logos/vllm-logo-text-dark.png">
+    <source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/vllm-project/vllm/main/docs/assets/logos/vllm-logo-text-dark.png">
-    <img alt="vLLM" src="https://raw.githubusercontent.com/vllm-project/vllm/main/docs/source/assets/logos/vllm-logo-text-light.png" width=55%>
+    <img alt="vLLM" src="https://raw.githubusercontent.com/vllm-project/vllm/main/docs/assets/logos/vllm-logo-text-light.png" width=55%>
   </picture>
 </p>

@@ -16,18 +16,20 @@ Easy, fast, and cheap LLM serving for everyone
 ---

 *Latest News* 🔥
+- [2025/05] We hosted [NYC vLLM Meetup](https://lu.ma/c1rqyf1f)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1_q_aW_ioMJWUImf1s1YM-ZhjXz8cUeL0IJvaquOYBeA/edit?usp=sharing).
+- [2025/05] vLLM is now a hosted project under PyTorch Foundation! Please find the announcement [here](https://pytorch.org/blog/pytorch-foundation-welcomes-vllm/).
 - [2025/04] We hosted [Asia Developer Day](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/19cp6Qu8u48ihB91A064XfaXruNYiBOUKrBxAmDOllOo/edit?usp=sharing).
-- [2025/03] We hosted [vLLM x Ollama Inference Night](https://lu.ma/vllm-ollama)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/16T2PDD1YwRnZ4Tu8Q5r6n53c5Lr5c73UV9Vd2_eBo4U/edit?usp=sharing).
-- [2025/03] We hosted [the first vLLM China Meetup](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg)! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1REHvfQMKGnvz6p3Fd23HhSO4c8j5WPGZV0bKYLwnHyQ/edit?usp=sharing).
-- [2025/03] We hosted [the East Coast vLLM Meetup](https://lu.ma/7mu4k4xx)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1NHiv8EUFF1NLd3fEYODm56nDmL26lEeXCaDgyDlTsRs/edit#slide=id.g31441846c39_0_0).
-- [2025/02] We hosted [the ninth vLLM meetup](https://lu.ma/h7g3kuj9) with Meta! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1jzC_PZVXrVNSFVCW-V4cFXb6pn7zZ2CyP_Flwo05aqg/edit?usp=sharing) and AMD [here](https://drive.google.com/file/d/1Zk5qEJIkTmlQ2eQcXQZlljAx3m9s7nwn/view?usp=sharing). The slides from Meta will not be posted.
 - [2025/01] We are excited to announce the alpha release of vLLM V1: A major architectural upgrade with 1.7x speedup! Clean code, optimized execution loop, zero-overhead prefix caching, enhanced multimodal support, and more. Please check out our blog post [here](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html).
-- [2025/01] We hosted [the eighth vLLM meetup](https://lu.ma/zep56hui) with Google Cloud! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1epVkt4Zu8Jz_S5OhEHPc798emsYh2BwYfRuDDVEF7u4/edit?usp=sharing), and Google Cloud team [here](https://drive.google.com/file/d/1h24pHewANyRL11xy5dXUbvRC9F9Kkjix/view?usp=sharing).
-- [2024/12] vLLM joins [pytorch ecosystem](https://pytorch.org/blog/vllm-joins-pytorch)! Easy, Fast, and Cheap LLM Serving for Everyone!

 <details>
 <summary>Previous News</summary>

+- [2025/03] We hosted [vLLM x Ollama Inference Night](https://lu.ma/vllm-ollama)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/16T2PDD1YwRnZ4Tu8Q5r6n53c5Lr5c73UV9Vd2_eBo4U/edit?usp=sharing).
+- [2025/03] We hosted [the first vLLM China Meetup](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg)! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1REHvfQMKGnvz6p3Fd23HhSO4c8j5WPGZV0bKYLwnHyQ/edit?usp=sharing).
+- [2025/03] We hosted [the East Coast vLLM Meetup](https://lu.ma/7mu4k4xx)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1NHiv8EUFF1NLd3fEYODm56nDmL26lEeXCaDgyDlTsRs/edit#slide=id.g31441846c39_0_0).
+- [2025/02] We hosted [the ninth vLLM meetup](https://lu.ma/h7g3kuj9) with Meta! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1jzC_PZVXrVNSFVCW-V4cFXb6pn7zZ2CyP_Flwo05aqg/edit?usp=sharing) and AMD [here](https://drive.google.com/file/d/1Zk5qEJIkTmlQ2eQcXQZlljAx3m9s7nwn/view?usp=sharing). The slides from Meta will not be posted.
+- [2025/01] We hosted [the eighth vLLM meetup](https://lu.ma/zep56hui) with Google Cloud! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1epVkt4Zu8Jz_S5OhEHPc798emsYh2BwYfRuDDVEF7u4/edit?usp=sharing), and Google Cloud team [here](https://drive.google.com/file/d/1h24pHewANyRL11xy5dXUbvRC9F9Kkjix/view?usp=sharing).
+- [2024/12] vLLM joins [pytorch ecosystem](https://pytorch.org/blog/vllm-joins-pytorch)! Easy, Fast, and Cheap LLM Serving for Everyone!
 - [2024/11] We hosted [the seventh vLLM meetup](https://lu.ma/h0qvrajz) with Snowflake! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing), and Snowflake team [here](https://docs.google.com/presentation/d/1qF3RkDAbOULwz9WK5TOltt2fE9t6uIc_hVNLFAaQX6A/edit?usp=sharing).
 - [2024/10] We have just created a developer slack ([slack.vllm.ai](https://slack.vllm.ai)) focusing on coordinating contributions and discussing features. Please feel free to join us there!
 - [2024/10] Ray Summit 2024 held a special track for vLLM! Please find the opening talk slides from the vLLM team [here](https://docs.google.com/presentation/d/1B_KQxpHBTRa_mDF-tR6i8rWdOU5QoTZNcEg2MKZxEHM/edit?usp=sharing). Learn more from the [talks](https://www.youtube.com/playlist?list=PLzTswPQNepXl6AQwifuwUImLPFRVpksjR) from other vLLM contributors and users!
@@ -56,8 +58,8 @@ vLLM is fast with:
 - Efficient management of attention key and value memory with [**PagedAttention**](https://blog.vllm.ai/2023/06/20/vllm.html)
 - Continuous batching of incoming requests
 - Fast model execution with CUDA/HIP graph
-- Quantizations: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), INT4, INT8, and FP8.
+- Quantizations: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), [AutoRound](https://arxiv.org/abs/2309.05516), INT4, INT8, and FP8
-- Optimized CUDA kernels, including integration with FlashAttention and FlashInfer.
+- Optimized CUDA kernels, including integration with FlashAttention and FlashInfer
 - Speculative decoding
 - Chunked prefill

@@ -70,14 +72,14 @@ vLLM is flexible and easy to use with:
 - Tensor parallelism and pipeline parallelism support for distributed inference
 - Streaming outputs
 - OpenAI-compatible API server
-- Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, TPU, and AWS Neuron.
+- Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, TPU, and AWS Neuron
 - Prefix caching support
-- Multi-lora support
+- Multi-LoRA support

 vLLM seamlessly supports most popular open-source models on HuggingFace, including:
 - Transformer-like LLMs (e.g., Llama)
 - Mixture-of-Expert LLMs (e.g., Mixtral, Deepseek-V2 and V3)
-- Embedding Models (e.g. E5-Mistral)
+- Embedding Models (e.g., E5-Mistral)
 - Multi-modal LLMs (e.g., LLaVA)

 Find the full list of supported models [here](https://docs.vllm.ai/en/latest/models/supported_models.html).
@@ -98,14 +100,14 @@ Visit our [documentation](https://docs.vllm.ai/en/latest/) to learn more.
 ## Contributing

 We welcome and value any contributions and collaborations.
-Please check out [Contributing to vLLM](https://docs.vllm.ai/en/stable/contributing/overview.html) for how to get involved.
+Please check out [Contributing to vLLM](https://docs.vllm.ai/en/latest/contributing/index.html) for how to get involved.

 ## Sponsors

 vLLM is a community project. Our compute resources for development and testing are supported by the following organizations. Thank you for your support!

 <!-- Note: Please sort them in alphabetical order. -->
-<!-- Note: Please keep these consistent with docs/source/community/sponsors.md -->
+<!-- Note: Please keep these consistent with docs/community/sponsors.md -->
 Cash Donations:
 - a16z
 - Dropbox
@@ -152,12 +154,14 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs

 ## Contact Us

+<!-- --8<-- [start:contact-us] -->
 - For technical questions and feature requests, please use GitHub [Issues](https://github.com/vllm-project/vllm/issues) or [Discussions](https://github.com/vllm-project/vllm/discussions)
 - For discussing with fellow users, please use the [vLLM Forum](https://discuss.vllm.ai)
-- coordinating contributions and development, please use [Slack](https://slack.vllm.ai)
+- For coordinating contributions and development, please use [Slack](https://slack.vllm.ai)
 - For security disclosures, please use GitHub's [Security Advisories](https://github.com/vllm-project/vllm/security/advisories) feature
 - For collaborations and partnerships, please contact us at [vllm-questions@lists.berkeley.edu](mailto:vllm-questions@lists.berkeley.edu)
+<!-- --8<-- [end:contact-us] -->

 ## Media Kit

-- If you wish to use vLLM's logo, please refer to [our media kit repo](https://github.com/vllm-project/media-kit).
+- If you wish to use vLLM's logo, please refer to [our media kit repo](https://github.com/vllm-project/media-kit)
@@ -8,4 +8,6 @@ Please report security issues privately using [the vulnerability submission form

 ---

+Please see the [Security Guide in the vLLM documentation](https://docs.vllm.ai/en/latest/usage/security.html) for more information on vLLM's security assumptions and recommendations.
+
 Please see [PyTorch's Security Policy](https://github.com/pytorch/pytorch/blob/main/SECURITY.md) for more information and recommendations on how to securely interact with models.
@@ -4,7 +4,7 @@ This README guides you through running benchmark tests with the extensive
 datasets supported on vLLM. It’s a living document, updated as new features and datasets
 become available.

-## Dataset Overview
+**Dataset Overview**

 <table style="width:100%; border-collapse: collapse;">
   <thead>
@@ -64,6 +64,12 @@ become available.
     <td style="text-align: center;">✅</td>
     <td><code>lmms-lab/LLaVA-OneVision-Data</code>, <code>Aeala/ShareGPT_Vicuna_unfiltered</code></td>
   </tr>
+  <tr>
+    <td><strong>Custom</strong></td>
+    <td style="text-align: center;">✅</td>
+    <td style="text-align: center;">✅</td>
+    <td>Local file: <code>data.jsonl</code></td>
+  </tr>
   </tbody>
 </table>

@@ -76,7 +82,10 @@ become available.
 **Note**: HuggingFace dataset's `dataset-name` should be set to `hf`

 ---
-## Example - Online Benchmark
+<details>
+<summary><b>🚀 Example - Online Benchmark</b></summary>
+
+<br/>

 First start serving your model
@@ -124,7 +133,40 @@ P99 ITL (ms): 8.39
 ==================================================
 ```

-### VisionArena Benchmark for Vision Language Models
+**Custom Dataset**
+
+If the dataset you want to benchmark is not yet supported in vLLM, you can still benchmark it using `CustomDataset`. Your data needs to be in `.jsonl` format with a "prompt" field per entry, e.g., `data.jsonl`:
+
+```
+{"prompt": "What is the capital of India?"}
+{"prompt": "What is the capital of Iran?"}
+{"prompt": "What is the capital of China?"}
+```
+
+```bash
+# start server
+VLLM_USE_V1=1 vllm serve meta-llama/Llama-3.1-8B-Instruct --disable-log-requests
+```
+
+```bash
+# run benchmarking script
+python3 benchmarks/benchmark_serving.py --port 9001 --save-result --save-detailed \
+    --backend vllm \
+    --model meta-llama/Llama-3.1-8B-Instruct \
+    --endpoint /v1/completions \
+    --dataset-name custom \
+    --dataset-path <path-to-your-data-jsonl> \
+    --custom-skip-chat-template \
+    --num-prompts 80 \
+    --max-concurrency 1 \
+    --temperature=0.3 \
+    --top-p=0.75 \
+    --result-dir "./log/"
+```
+
+You can skip applying the chat template if your data already has it applied, by passing `--custom-skip-chat-template`.
+
+**VisionArena Benchmark for Vision Language Models**

 ```bash
 # need a model with vision capability here
@@ -142,14 +184,13 @@ python3 vllm/benchmarks/benchmark_serving.py \
   --num-prompts 1000
 ```

-### InstructCoder Benchmark with Speculative Decoding
+**InstructCoder Benchmark with Speculative Decoding**

 ``` bash
 VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \
-    --speculative-model "[ngram]" \
-    --ngram_prompt_lookup_min 2 \
-    --ngram-prompt-lookup-max 5 \
-    --num_speculative_tokens 5
+    --speculative-config $'{"method": "ngram",
+    "num_speculative_tokens": 5, "prompt_lookup_max": 5,
+    "prompt_lookup_min": 2}'
 ```

 ``` bash
@@ -160,7 +201,7 @@ python3 benchmarks/benchmark_serving.py \
   --num-prompts 2048
 ```

-### Other HuggingFaceDataset Examples
+**Other HuggingFaceDataset Examples**

 ```bash
 vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests
@@ -204,7 +245,17 @@ python3 vllm/benchmarks/benchmark_serving.py \
   --seed 42
 ```

-### Running With Sampling Parameters
+**`philschmid/mt-bench`**
+
+``` bash
+python3 vllm/benchmarks/benchmark_serving.py \
+    --model Qwen/QwQ-32B \
+    --dataset-name hf \
+    --dataset-path philschmid/mt-bench \
+    --num-prompts 80
+```
+
+**Running With Sampling Parameters**

 When using OpenAI-compatible backends such as `vllm`, optional sampling
 parameters can be specified. Example client command:
@@ -222,8 +273,27 @@ python3 vllm/benchmarks/benchmark_serving.py \
   --num-prompts 10
 ```

----
-## Example - Offline Throughput Benchmark
+**Running With Ramp-Up Request Rate**
+
+The benchmark tool also supports ramping up the request rate over the
+duration of the benchmark run. This can be useful for stress testing the
+server or finding the maximum throughput that it can handle, given some latency budget.
+
+Two ramp-up strategies are supported:
+
+- `linear`: Increases the request rate linearly from a start value to an end value.
+- `exponential`: Increases the request rate exponentially.
+
+The following arguments can be used to control the ramp-up:
+
+- `--ramp-up-strategy`: The ramp-up strategy to use (`linear` or `exponential`).
+- `--ramp-up-start-rps`: The request rate at the beginning of the benchmark.
+- `--ramp-up-end-rps`: The request rate at the end of the benchmark.
+
+</details>
+
+<details>
+<summary><b>📈 Example - Offline Throughput Benchmark</b></summary>
+
+<br/>

 ```bash
 python3 vllm/benchmarks/benchmark_throughput.py \
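The ramp-up flags added in the hunk above plug into the same `benchmark_serving.py` invocation used throughout this section. A minimal sketch, assuming a server is already running on the default port; the model, dataset, and rate values are arbitrary placeholders:

```bash
# Linearly ramp the request rate from 1 RPS to 20 RPS over the run.
python3 benchmarks/benchmark_serving.py \
    --backend vllm \
    --model meta-llama/Llama-3.1-8B-Instruct \
    --dataset-name random \
    --random-input-len 512 \
    --random-output-len 128 \
    --num-prompts 500 \
    --ramp-up-strategy linear \
    --ramp-up-start-rps 1 \
    --ramp-up-end-rps 20
```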
@@ -241,7 +311,7 @@ Total num prompt tokens: 5014
 Total num output tokens: 1500
 ```

-### VisionArena Benchmark for Vision Language Models
+**VisionArena Benchmark for Vision Language Models**

 ``` bash
 python3 vllm/benchmarks/benchmark_throughput.py \
@@ -261,7 +331,7 @@ Total num prompt tokens: 14527
 Total num output tokens: 1280
 ```

-### InstructCoder Benchmark with Speculative Decoding
+**InstructCoder Benchmark with Speculative Decoding**

 ``` bash
 VLLM_WORKER_MULTIPROC_METHOD=spawn \
@@ -274,10 +344,9 @@ python3 vllm/benchmarks/benchmark_throughput.py \
     --output-len=100 \
     --num-prompts=2048 \
     --async-engine \
-    --speculative-model="[ngram]" \
-    --ngram_prompt_lookup_min=2 \
-    --ngram-prompt-lookup-max=5 \
-    --num_speculative_tokens=5
+    --speculative-config $'{"method": "ngram",
+    "num_speculative_tokens": 5, "prompt_lookup_max": 5,
+    "prompt_lookup_min": 2}'
 ```

 ```
@@ -286,7 +355,7 @@ Total num prompt tokens: 261136
 Total num output tokens: 204800
 ```

-### Other HuggingFaceDataset Examples
+**Other HuggingFaceDataset Examples**

 **`lmms-lab/LLaVA-OneVision-Data`**

@@ -325,7 +394,7 @@ python3 benchmarks/benchmark_throughput.py \
   --num-prompts 10
 ```

-### Benchmark with LoRA Adapters
+**Benchmark with LoRA Adapters**

 ``` bash
 # download dataset
@@ -341,3 +410,196 @@ python3 vllm/benchmarks/benchmark_throughput.py \
   --enable-lora \
   --lora-path yard1/llama-2-7b-sql-lora-test
 ```
+
+</details>
+
+<details>
+<summary><b>🛠️ Example - Structured Output Benchmark</b></summary>
+
+<br/>
+
+Benchmark the performance of structured output generation (JSON, grammar, regex).
+
+**Server Setup**
+
+```bash
+vllm serve NousResearch/Hermes-3-Llama-3.1-8B --disable-log-requests
+```
+
+**JSON Schema Benchmark**
+
+```bash
+python3 benchmarks/benchmark_serving_structured_output.py \
+    --backend vllm \
+    --model NousResearch/Hermes-3-Llama-3.1-8B \
+    --dataset json \
+    --structured-output-ratio 1.0 \
+    --request-rate 10 \
+    --num-prompts 1000
+```
+
+**Grammar-based Generation Benchmark**
+
+```bash
+python3 benchmarks/benchmark_serving_structured_output.py \
+    --backend vllm \
+    --model NousResearch/Hermes-3-Llama-3.1-8B \
+    --dataset grammar \
+    --structure-type grammar \
+    --request-rate 10 \
+    --num-prompts 1000
+```
+
+**Regex-based Generation Benchmark**
+
+```bash
+python3 benchmarks/benchmark_serving_structured_output.py \
+    --backend vllm \
+    --model NousResearch/Hermes-3-Llama-3.1-8B \
+    --dataset regex \
+    --request-rate 10 \
+    --num-prompts 1000
+```
+
+**Choice-based Generation Benchmark**
+
+```bash
+python3 benchmarks/benchmark_serving_structured_output.py \
+    --backend vllm \
+    --model NousResearch/Hermes-3-Llama-3.1-8B \
+    --dataset choice \
+    --request-rate 10 \
+    --num-prompts 1000
+```
+
+**XGrammar Benchmark Dataset**
+
+```bash
+python3 benchmarks/benchmark_serving_structured_output.py \
+    --backend vllm \
+    --model NousResearch/Hermes-3-Llama-3.1-8B \
+    --dataset xgrammar_bench \
+    --request-rate 10 \
+    --num-prompts 1000
+```
+
+</details>
+
+<details>
+<summary><b>📚 Example - Long Document QA Benchmark</b></summary>
+
+<br/>
+
+Benchmark the performance of long document question-answering with prefix caching.
+
+**Basic Long Document QA Test**
+
+```bash
+python3 benchmarks/benchmark_long_document_qa_throughput.py \
+    --model meta-llama/Llama-2-7b-chat-hf \
+    --enable-prefix-caching \
+    --num-documents 16 \
+    --document-length 2000 \
+    --output-len 50 \
+    --repeat-count 5
+```
+
+**Different Repeat Modes**
+
+```bash
+# Random mode (default) - shuffle prompts randomly
+python3 benchmarks/benchmark_long_document_qa_throughput.py \
+    --model meta-llama/Llama-2-7b-chat-hf \
+    --enable-prefix-caching \
+    --num-documents 8 \
+    --document-length 3000 \
+    --repeat-count 3 \
+    --repeat-mode random
+
+# Tile mode - repeat entire prompt list in sequence
+python3 benchmarks/benchmark_long_document_qa_throughput.py \
+    --model meta-llama/Llama-2-7b-chat-hf \
+    --enable-prefix-caching \
+    --num-documents 8 \
+    --document-length 3000 \
+    --repeat-count 3 \
+    --repeat-mode tile
+
+# Interleave mode - repeat each prompt consecutively
+python3 benchmarks/benchmark_long_document_qa_throughput.py \
+    --model meta-llama/Llama-2-7b-chat-hf \
+    --enable-prefix-caching \
+    --num-documents 8 \
+    --document-length 3000 \
+    --repeat-count 3 \
+    --repeat-mode interleave
+```
+
+</details>
+
+<details>
+<summary><b>🗂️ Example - Prefix Caching Benchmark</b></summary>
+
+<br/>
+
+Benchmark the efficiency of automatic prefix caching.
+
+**Fixed Prompt with Prefix Caching**
+
+```bash
+python3 benchmarks/benchmark_prefix_caching.py \
+    --model meta-llama/Llama-2-7b-chat-hf \
+    --enable-prefix-caching \
+    --num-prompts 1 \
+    --repeat-count 100 \
+    --input-length-range 128:256
+```
+
+**ShareGPT Dataset with Prefix Caching**
+
+```bash
+# download dataset
+# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
+
+python3 benchmarks/benchmark_prefix_caching.py \
+    --model meta-llama/Llama-2-7b-chat-hf \
+    --dataset-path /path/ShareGPT_V3_unfiltered_cleaned_split.json \
+    --enable-prefix-caching \
+    --num-prompts 20 \
+    --repeat-count 5 \
+    --input-length-range 128:256
+```
+
+</details>
+
+<details>
+<summary><b>⚡ Example - Request Prioritization Benchmark</b></summary>
+
+<br/>
+
+Benchmark the performance of request prioritization in vLLM.
+
+**Basic Prioritization Test**
+
+```bash
+python3 benchmarks/benchmark_prioritization.py \
+    --model meta-llama/Llama-2-7b-chat-hf \
+    --input-len 128 \
+    --output-len 64 \
+    --num-prompts 100 \
+    --scheduling-policy priority
+```
+
+**Multiple Sequences per Prompt**
+
+```bash
+python3 benchmarks/benchmark_prioritization.py \
+    --model meta-llama/Llama-2-7b-chat-hf \
+    --input-len 128 \
+    --output-len 64 \
+    --num-prompts 100 \
+    --scheduling-policy priority \
+    --n 2
+```
+
+</details>
benchmarks/auto_tune.sh (new file, 276 lines)
@@ -0,0 +1,276 @@
#!/bin/bash

# This script aims to tune the best server parameter combinations to maximize throughput for a given requirement.
# The current server parameter combination is max_num_seqs and max_num_batched_tokens.
# It also supports additional requirements: e2e latency and prefix cache.

# Pre-requisite:
# 1. Check out your branch and install/update the correct running env. For TPU, activate the conda env and install the corresponding torch and xla versions.
# 2. If the model is customized, replace the MODEL's config with the customized config.
# 3. Set variables (ALL REQUIRED)
#   BASE: your directory for the vllm repo
#   MODEL: the model served by vllm
#   SYSTEM: the hardware, either TPU or GPU; for other systems "get best profile" might not be supported.
#   TP: ways of tensor parallelism
#   DOWNLOAD_DIR: directory to download and load model weights.
#   INPUT_LEN: request input len
#   OUTPUT_LEN: request output len
#   MIN_CACHE_HIT_PCT: prefix cache rate
#   MAX_LATENCY_ALLOWED_MS: (e2e) latency requirement. If there's no latency requirement, set it to a large number like 1000000000
#   NUM_SEQS_LIST: a list of `max-num-seqs` you want to loop with.
#   NUM_BATCHED_TOKENS_LIST: a list of `max-num-batched-tokens` you want to loop with.
#   Note that the default NUM_SEQS_LIST and NUM_BATCHED_TOKENS_LIST are set for medium-size input/output len; for extra short context (such as 20:20), you might need to include larger numbers in NUM_SEQS_LIST.
# 4. Run the script; it might take a long time. You can use tmux so the script keeps running if a disconnection happens.
# 5. The final result will be saved in the RESULT file.


# Example use cases
# 1. Given input_len=1800, output_len=20, what's the best max_num_seqs and max_num_batched_tokens to get the highest throughput?
#    Use INPUT_LEN=1800, OUTPUT_LEN=20, MIN_CACHE_HIT_PCT=0, MAX_LATENCY_ALLOWED_MS=100000000000
# 2. If we have a latency requirement of lower than 500ms, what's the best server parameter?
#    Use INPUT_LEN=1800, OUTPUT_LEN=20, MIN_CACHE_HIT_PCT=0, MAX_LATENCY_ALLOWED_MS=500
# 3. If we want to reach a 60% prefix cache hit rate, what's the best server parameter?
#    Use INPUT_LEN=1800, OUTPUT_LEN=20, MIN_CACHE_HIT_PCT=60, MAX_LATENCY_ALLOWED_MS=500

TAG=$(date +"%Y_%m_%d_%H_%M")
BASE=""
MODEL="meta-llama/Llama-3.1-8B-Instruct"
SYSTEM="TPU"
TP=1
DOWNLOAD_DIR=""
INPUT_LEN=4000
OUTPUT_LEN=16
MIN_CACHE_HIT_PCT=0
MAX_LATENCY_ALLOWED_MS=100000000000
NUM_SEQS_LIST="128 256"
NUM_BATCHED_TOKENS_LIST="512 1024 2048 4096"

LOG_FOLDER="$BASE/auto-benchmark/$TAG"
RESULT="$LOG_FOLDER/result.txt"
PROFILE_PATH="$LOG_FOLDER/profile"

echo "result file: $RESULT"
echo "model: $MODEL"

rm -rf $LOG_FOLDER
rm -rf $PROFILE_PATH
mkdir -p $LOG_FOLDER
mkdir -p $PROFILE_PATH

cd "$BASE/vllm"

pip install -q datasets

current_hash=$(git rev-parse HEAD)
echo "hash:$current_hash" >> "$RESULT"
echo "current_hash: $current_hash"

best_throughput=0
best_max_num_seqs=0
best_num_batched_tokens=0
best_goodput=0

start_server() {
    local gpu_memory_utilization=$1
    local max_num_seqs=$2
    local max_num_batched_tokens=$3
    local vllm_log=$4
    local profile_dir=$5

    pkill -f vllm

    VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir vllm serve $MODEL \
        --disable-log-requests \
        --port 8004 \
        --gpu-memory-utilization $gpu_memory_utilization \
        --max-num-seqs $max_num_seqs \
        --max-num-batched-tokens $max_num_batched_tokens \
        --tensor-parallel-size $TP \
        --enable-prefix-caching \
        --load-format dummy \
        --download-dir "$DOWNLOAD_DIR" \
        --max-model-len $(( INPUT_LEN+OUTPUT_LEN )) > "$vllm_log" 2>&1 &

    # wait for 10 minutes...
    server_started=0
    for i in {1..60}; do
        RESPONSE=$(curl -s -X GET "http://0.0.0.0:8004/health" -w "%{http_code}" -o /dev/stdout)
        STATUS_CODE=$(echo "$RESPONSE" | tail -n 1)
        if [[ "$STATUS_CODE" -eq 200 ]]; then
            server_started=1
            break
        else
            sleep 10
        fi
    done
    if (( ! server_started )); then
        echo "server did not start within 10 minutes. Please check the server log at $vllm_log."
        return 1
    else
        return 0
    fi
}

update_best_profile() {
    local profile_dir=$1
    local profile_index=$2
    sorted_paths=($(find "$profile_dir" -maxdepth 1 -not -path "$profile_dir" | sort))
    selected_profile_file=
    if [[ "$SYSTEM" == "TPU" ]]; then
        selected_profile_file="${sorted_paths[$profile_index]}/*.xplane.pb"
    fi
    if [[ "$SYSTEM" == "GPU" ]]; then
        selected_profile_file="${sorted_paths[$profile_index]}"
    fi
    rm -f $PROFILE_PATH/*
    cp $selected_profile_file $PROFILE_PATH
}

run_benchmark() {
    local max_num_seqs=$1
    local max_num_batched_tokens=$2
    local gpu_memory_utilization=$3
    echo "max_num_seq: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens"
    local vllm_log="$LOG_FOLDER/vllm_log_${max_num_seqs}_${max_num_batched_tokens}.txt"
    local profile_dir="$LOG_FOLDER/profile_${max_num_seqs}_${max_num_batched_tokens}"
    echo "vllm_log: $vllm_log"
    echo
    rm -f $vllm_log
    mkdir -p $profile_dir
    pkill -f vllm
    local profile_index=0

    echo "starting server..."
    start_server $gpu_memory_utilization $max_num_seqs $max_num_batched_tokens $vllm_log $profile_dir
    result=$?
    if [[ "$result" -eq 1 ]]; then
        echo "server failed to start. gpu_memory_utilization:$gpu_memory_utilization, max_num_seqs:$max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens"
    else
        echo "server started."
    fi
    echo

    echo "run benchmark test..."
    meet_latency_requirement=0
    # get a basic qps by using request-rate inf
    bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_inf.txt"
    prefix_len=$(( INPUT_LEN * MIN_CACHE_HIT_PCT / 100 ))
    python benchmarks/benchmark_serving.py \
        --backend vllm \
        --model $MODEL \
        --dataset-name random \
        --random-input-len $INPUT_LEN \
        --random-output-len $OUTPUT_LEN \
        --ignore-eos \
        --disable-tqdm \
        --request-rate inf \
        --percentile-metrics ttft,tpot,itl,e2el \
        --goodput e2el:$MAX_LATENCY_ALLOWED_MS \
        --num-prompts 1000 \
        --random-prefix-len $prefix_len \
        --port 8004 \
        --profile &> "$bm_log"
    throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
    e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
    goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')

    if (( $(echo "$e2el <= $MAX_LATENCY_ALLOWED_MS" | bc -l) )); then
        meet_latency_requirement=1
        request_rate=inf
    fi

    if (( ! meet_latency_requirement )); then
        # start from request-rate as int(throughput) + 1
        request_rate=$((${throughput%.*} + 1))
        while ((request_rate > 0)); do
            profile_index=$((profile_index+1))
            # clear prefix cache
            curl -X POST http://0.0.0.0:8004/reset_prefix_cache
            sleep 5
            bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_${request_rate}.txt"
            python benchmarks/benchmark_serving.py \
                --backend vllm \
                --model $MODEL \
                --dataset-name random \
                --random-input-len $INPUT_LEN \
                --random-output-len $OUTPUT_LEN \
                --ignore-eos \
                --disable-tqdm \
                --request-rate $request_rate \
                --percentile-metrics ttft,tpot,itl,e2el \
                --goodput e2el:$MAX_LATENCY_ALLOWED_MS \
                --num-prompts 100 \
                --random-prefix-len $prefix_len \
                --port 8004 &> "$bm_log"
            throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
            e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
            goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
            if (( $(echo "$e2el <= $MAX_LATENCY_ALLOWED_MS" | bc -l) )); then
                meet_latency_requirement=1
                break
            fi
            request_rate=$((request_rate-1))
        done
    fi
    # write the results and update the best result.
    if ((meet_latency_requirement)); then
        echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, throughput: $throughput, goodput: $goodput"
        echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, throughput: $throughput, goodput: $goodput" >> "$RESULT"
        if (( $(echo "$throughput > $best_throughput" | bc -l) )); then
            best_throughput=$throughput
            best_max_num_seqs=$max_num_seqs
            best_num_batched_tokens=$max_num_batched_tokens
            best_goodput=$goodput
            if [[ "$SYSTEM" == "TPU" ]]; then
                update_best_profile "$profile_dir/plugins/profile" $profile_index
            fi
            if [[ "$SYSTEM" == "GPU" ]]; then
                update_best_profile "$profile_dir" $profile_index
            fi
        fi
    else
        echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens does not meet latency requirement ${MAX_LATENCY_ALLOWED_MS}"
        echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens does not meet latency requirement ${MAX_LATENCY_ALLOWED_MS}" >> "$RESULT"
    fi

    echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput"

    pkill vllm
    sleep 10
    printf '=%.0s' $(seq 1 20)
    return 0
}

read -r -a num_seqs_list <<< "$NUM_SEQS_LIST"
read -r -a num_batched_tokens_list <<< "$NUM_BATCHED_TOKENS_LIST"

# first find out the max gpu-memory-utilization without HBM OOM.
gpu_memory_utilization=0.98
find_gpu_memory_utilization=0
while (( $(echo "$gpu_memory_utilization >= 0.9" | bc -l) )); do
    start_server $gpu_memory_utilization "${num_seqs_list[-1]}" "${num_batched_tokens_list[-1]}" "$LOG_FOLDER/vllm_log_gpu_memory_utilization_$gpu_memory_utilization.log"
    result=$?
    if [[ "$result" -eq 0 ]]; then
        find_gpu_memory_utilization=1
        break
    else
        gpu_memory_utilization=$(echo "$gpu_memory_utilization - 0.01" | bc)
    fi
done

if [[ "$find_gpu_memory_utilization" -eq 1 ]]; then
    echo "Using gpu_memory_utilization=$gpu_memory_utilization to serve model."
else
    echo "Cannot find a proper gpu_memory_utilization over 0.9 to serve the model, please check logs in $LOG_FOLDER."
    exit 1
fi

for num_seqs in "${num_seqs_list[@]}"; do
    for num_batched_tokens in "${num_batched_tokens_list[@]}"; do
        run_benchmark $num_seqs $num_batched_tokens $gpu_memory_utilization
    done
done
echo "finish permutations"
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH"
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH" >> "$RESULT"
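The script above is meant to be edited and then run directly. One possible invocation, assuming the variables at the top have been filled in and you launch from the directory that contains the `benchmarks/` folder (step 4 in the script's own notes suggests tmux so the long sweep survives a dropped connection):

```bash
# keep the long-running sweep alive in a tmux session
tmux new -s auto_tune
bash benchmarks/auto_tune.sh
```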
@ -1,4 +1,5 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
|
|
||||||
import io
|
import io
|
||||||
import json
|
import json
|
||||||
@ -12,8 +13,7 @@ from typing import Optional, Union
|
|||||||
import aiohttp
|
import aiohttp
|
||||||
import huggingface_hub.constants
|
import huggingface_hub.constants
|
||||||
from tqdm.asyncio import tqdm
|
from tqdm.asyncio import tqdm
|
||||||
from transformers import (AutoTokenizer, PreTrainedTokenizer,
|
from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast
|
||||||
PreTrainedTokenizerFast)
|
|
||||||
|
|
||||||
# NOTE(simon): do not import vLLM here so the benchmark script
|
# NOTE(simon): do not import vLLM here so the benchmark script
|
||||||
# can run without vLLM installed.
|
# can run without vLLM installed.
|
||||||
@ -43,8 +43,7 @@ class RequestFuncOutput:
|
|||||||
latency: float = 0.0
|
latency: float = 0.0
|
||||||
output_tokens: int = 0
|
output_tokens: int = 0
|
||||||
ttft: float = 0.0 # Time to first token
|
ttft: float = 0.0 # Time to first token
|
||||||
itl: list[float] = field(
|
itl: list[float] = field(default_factory=list) # list of inter-token latencies
|
||||||
default_factory=list) # list of inter-token latencies
|
|
||||||
tpot: float = 0.0 # avg next-token latencies
|
tpot: float = 0.0 # avg next-token latencies
|
||||||
prompt_len: int = 0
|
prompt_len: int = 0
|
||||||
error: str = ""
|
error: str = ""
|
||||||
@ -57,8 +56,9 @@ async def async_request_tgi(
|
|||||||
api_url = request_func_input.api_url
|
api_url = request_func_input.api_url
|
||||||
assert api_url.endswith("generate_stream")
|
assert api_url.endswith("generate_stream")
|
||||||
|
|
||||||
async with aiohttp.ClientSession(trust_env=True,
|
async with aiohttp.ClientSession(
|
||||||
timeout=AIOHTTP_TIMEOUT) as session:
|
trust_env=True, timeout=AIOHTTP_TIMEOUT
|
||||||
|
) as session:
|
||||||
params = {
|
params = {
|
||||||
"max_new_tokens": request_func_input.output_len,
|
"max_new_tokens": request_func_input.output_len,
|
||||||
"do_sample": True,
|
"do_sample": True,
|
||||||
@ -105,8 +105,7 @@ async def async_request_tgi(
|
|||||||
|
|
||||||
# Decoding phase
|
# Decoding phase
|
||||||
else:
|
else:
|
||||||
output.itl.append(timestamp -
|
output.itl.append(timestamp - most_recent_timestamp)
|
||||||
most_recent_timestamp)
|
|
||||||
|
|
||||||
most_recent_timestamp = timestamp
|
most_recent_timestamp = timestamp
|
||||||
|
|
||||||
@ -133,8 +132,9 @@ async def async_request_trt_llm(
|
|||||||
api_url = request_func_input.api_url
|
api_url = request_func_input.api_url
|
||||||
assert api_url.endswith("generate_stream")
|
assert api_url.endswith("generate_stream")
|
||||||
|
|
||||||
async with aiohttp.ClientSession(trust_env=True,
|
async with aiohttp.ClientSession(
|
||||||
timeout=AIOHTTP_TIMEOUT) as session:
|
trust_env=True, timeout=AIOHTTP_TIMEOUT
|
||||||
|
) as session:
|
||||||
payload = {
|
payload = {
|
||||||
"accumulate_tokens": True,
|
"accumulate_tokens": True,
|
||||||
"text_input": request_func_input.prompt,
|
"text_input": request_func_input.prompt,
|
||||||
@ -159,8 +159,7 @@ async def async_request_trt_llm(
|
|||||||
if not chunk_bytes:
|
if not chunk_bytes:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
chunk = chunk_bytes.decode("utf-8").removeprefix(
|
chunk = chunk_bytes.decode("utf-8").removeprefix("data:")
|
||||||
"data:")
|
|
||||||
|
|
||||||
data = json.loads(chunk)
|
data = json.loads(chunk)
|
||||||
output.generated_text += data["text_output"]
|
output.generated_text += data["text_output"]
|
||||||
@ -172,8 +171,7 @@ async def async_request_trt_llm(
|
|||||||
|
|
||||||
# Decoding phase
|
# Decoding phase
|
||||||
else:
|
else:
|
||||||
output.itl.append(timestamp -
|
output.itl.append(timestamp - most_recent_timestamp)
|
||||||
most_recent_timestamp)
|
|
||||||
|
|
||||||
most_recent_timestamp = timestamp
|
most_recent_timestamp = timestamp
|
||||||
|
|
||||||
@ -197,15 +195,23 @@ async def async_request_deepspeed_mii(
|
|||||||
request_func_input: RequestFuncInput,
|
request_func_input: RequestFuncInput,
|
||||||
pbar: Optional[tqdm] = None,
|
pbar: Optional[tqdm] = None,
|
||||||
) -> RequestFuncOutput:
|
) -> RequestFuncOutput:
|
||||||
async with aiohttp.ClientSession(trust_env=True,
|
api_url = request_func_input.api_url
|
||||||
timeout=AIOHTTP_TIMEOUT) as session:
|
assert api_url.endswith(("completions", "profile")), (
|
||||||
|
"OpenAI Completions API URL must end with 'completions' or 'profile'."
|
||||||
|
)
|
||||||
|
|
||||||
|
async with aiohttp.ClientSession(
|
||||||
|
trust_env=True, timeout=AIOHTTP_TIMEOUT
|
||||||
|
) as session:
|
||||||
payload = {
|
payload = {
|
||||||
|
"model": request_func_input.model,
|
||||||
"prompt": request_func_input.prompt,
|
"prompt": request_func_input.prompt,
|
||||||
"max_tokens": request_func_input.output_len,
|
"max_tokens": request_func_input.output_len,
|
||||||
"temperature": 0.01, # deepspeed-mii does not accept 0.0 temp.
|
"temperature": 0.01, # deepspeed-mii does not accept 0.0 temp.
|
||||||
"top_p": 1.0,
|
"top_p": 1.0,
|
||||||
}
|
}
|
||||||
|
headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
|
||||||
|
|
||||||
output = RequestFuncOutput()
|
output = RequestFuncOutput()
|
||||||
output.prompt_len = request_func_input.prompt_len
|
output.prompt_len = request_func_input.prompt_len
|
||||||
|
|
||||||
@ -216,19 +222,21 @@ async def async_request_deepspeed_mii(
|
|||||||
|
|
||||||
st = time.perf_counter()
|
st = time.perf_counter()
|
||||||
try:
|
try:
|
||||||
async with session.post(url=request_func_input.api_url,
|
async with session.post(
|
||||||
json=payload) as response:
|
url=api_url, json=payload, headers=headers
|
||||||
|
) as response:
|
||||||
if response.status == 200:
|
if response.status == 200:
|
||||||
parsed_resp = await response.json()
|
parsed_resp = await response.json()
|
||||||
output.latency = time.perf_counter() - st
|
output.latency = time.perf_counter() - st
|
||||||
if "choices" in parsed_resp:
|
if "choices" in parsed_resp:
|
||||||
output.generated_text = parsed_resp["choices"][0][
|
output.generated_text = parsed_resp["choices"][0]["text"]
|
||||||
"text"]
|
|
||||||
elif "text" in parsed_resp:
|
elif "text" in parsed_resp:
|
||||||
output.generated_text = parsed_resp["text"][0]
|
output.generated_text = parsed_resp["text"][0]
|
||||||
else:
|
else:
|
||||||
output.error = ("Unexpected response format: "
|
output.error = (
|
||||||
"neither 'choices' nor 'text' found")
|
"Unexpected response format: "
|
||||||
|
"neither 'choices' nor 'text' found"
|
||||||
|
)
|
||||||
output.success = False
|
output.success = False
|
||||||
output.success = True
|
output.success = True
|
||||||
else:
|
else:
|
||||||
@ -249,17 +257,20 @@ async def async_request_openai_completions(
|
|||||||
pbar: Optional[tqdm] = None,
|
pbar: Optional[tqdm] = None,
|
||||||
) -> RequestFuncOutput:
|
) -> RequestFuncOutput:
|
||||||
api_url = request_func_input.api_url
|
api_url = request_func_input.api_url
|
||||||
assert api_url.endswith(
|
assert api_url.endswith(("completions", "profile")), (
|
||||||
("completions", "profile")
|
"OpenAI Completions API URL must end with 'completions' or 'profile'."
|
||||||
), "OpenAI Completions API URL must end with 'completions' or 'profile'."
|
)
|
||||||
|
|
||||||
async with aiohttp.ClientSession(trust_env=True,
|
async with aiohttp.ClientSession(
|
||||||
timeout=AIOHTTP_TIMEOUT) as session:
|
trust_env=True, timeout=AIOHTTP_TIMEOUT
|
||||||
|
) as session:
|
||||||
payload = {
|
payload = {
|
||||||
"model": request_func_input.model_name \
|
"model": request_func_input.model_name
|
||||||
if request_func_input.model_name else request_func_input.model,
|
if request_func_input.model_name
|
||||||
|
else request_func_input.model,
|
||||||
"prompt": request_func_input.prompt,
|
"prompt": request_func_input.prompt,
|
||||||
"temperature": 0.0,
|
"temperature": 0.0,
|
||||||
|
"repetition_penalty": 1.0,
|
||||||
"max_tokens": request_func_input.output_len,
|
"max_tokens": request_func_input.output_len,
|
||||||
"logprobs": request_func_input.logprobs,
|
"logprobs": request_func_input.logprobs,
|
||||||
"stream": True,
|
"stream": True,
|
||||||
@ -271,9 +282,7 @@ async def async_request_openai_completions(
|
|||||||
payload["ignore_eos"] = request_func_input.ignore_eos
|
payload["ignore_eos"] = request_func_input.ignore_eos
|
||||||
if request_func_input.extra_body:
|
if request_func_input.extra_body:
|
||||||
payload.update(request_func_input.extra_body)
|
payload.update(request_func_input.extra_body)
|
||||||
headers = {
|
headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
|
||||||
"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"
|
|
||||||
}
|
|
||||||
|
|
||||||
output = RequestFuncOutput()
|
output = RequestFuncOutput()
|
||||||
output.prompt_len = request_func_input.prompt_len
|
output.prompt_len = request_func_input.prompt_len
|
||||||
@ -282,8 +291,9 @@ async def async_request_openai_completions(
|
|||||||
st = time.perf_counter()
|
st = time.perf_counter()
|
||||||
most_recent_timestamp = st
|
most_recent_timestamp = st
|
||||||
try:
|
try:
|
||||||
async with session.post(url=api_url, json=payload,
|
async with session.post(
|
||||||
headers=headers) as response:
|
url=api_url, json=payload, headers=headers
|
||||||
|
) as response:
|
||||||
if response.status == 200:
|
if response.status == 200:
|
||||||
first_chunk_received = False
|
first_chunk_received = False
|
||||||
async for chunk_bytes in response.content:
|
async for chunk_bytes in response.content:
|
||||||
@ -291,8 +301,7 @@ async def async_request_openai_completions(
|
|||||||
if not chunk_bytes:
|
if not chunk_bytes:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
chunk = chunk_bytes.decode("utf-8").removeprefix(
|
chunk = chunk_bytes.decode("utf-8").removeprefix("data: ")
|
||||||
"data: ")
|
|
||||||
if chunk != "[DONE]":
|
if chunk != "[DONE]":
|
||||||
data = json.loads(chunk)
|
data = json.loads(chunk)
|
||||||
|
|
||||||
@ -312,21 +321,20 @@ async def async_request_openai_completions(
|
|||||||
|
|
||||||
# Decoding phase
|
# Decoding phase
|
||||||
else:
|
else:
|
||||||
output.itl.append(timestamp -
|
output.itl.append(timestamp - most_recent_timestamp)
|
||||||
most_recent_timestamp)
|
|
||||||
|
|
||||||
most_recent_timestamp = timestamp
|
most_recent_timestamp = timestamp
|
||||||
generated_text += text or ""
|
generated_text += text or ""
|
||||||
elif usage := data.get("usage"):
|
if usage := data.get("usage"):
|
||||||
output.output_tokens = usage.get(
|
output.output_tokens = usage.get("completion_tokens")
|
||||||
"completion_tokens")
|
|
||||||
if first_chunk_received:
|
if first_chunk_received:
|
||||||
output.success = True
|
output.success = True
|
||||||
else:
|
else:
|
||||||
output.success = False
|
output.success = False
|
||||||
output.error = (
|
output.error = (
|
||||||
"Never received a valid chunk to calculate TTFT."
|
"Never received a valid chunk to calculate TTFT."
|
||||||
"This response will be marked as failed!")
|
"This response will be marked as failed!"
|
||||||
|
)
|
||||||
output.generated_text = generated_text
|
output.generated_text = generated_text
|
||||||
output.latency = most_recent_timestamp - st
|
output.latency = most_recent_timestamp - st
|
||||||
else:
|
else:
|
||||||
@ -347,23 +355,22 @@ async def async_request_openai_chat_completions(
|
|||||||
pbar: Optional[tqdm] = None,
|
pbar: Optional[tqdm] = None,
|
||||||
) -> RequestFuncOutput:
|
) -> RequestFuncOutput:
|
||||||
api_url = request_func_input.api_url
|
api_url = request_func_input.api_url
|
||||||
assert api_url.endswith(
|
assert api_url.endswith(("chat/completions", "profile")), (
|
||||||
("chat/completions", "profile")
|
"OpenAI Chat Completions API URL must end with 'chat/completions'."
|
||||||
), "OpenAI Chat Completions API URL must end with 'chat/completions'."
|
)
|
||||||
|
|
||||||
async with aiohttp.ClientSession(trust_env=True,
|
async with aiohttp.ClientSession(
|
||||||
timeout=AIOHTTP_TIMEOUT) as session:
|
trust_env=True, timeout=AIOHTTP_TIMEOUT
|
||||||
|
) as session:
|
||||||
content = [{"type": "text", "text": request_func_input.prompt}]
|
content = [{"type": "text", "text": request_func_input.prompt}]
|
||||||
if request_func_input.multi_modal_content:
|
if request_func_input.multi_modal_content:
|
||||||
content.append(request_func_input.multi_modal_content)
|
content.append(request_func_input.multi_modal_content)
|
||||||
payload = {
|
payload = {
|
||||||
"model": request_func_input.model_name \
|
"model": request_func_input.model_name
|
||||||
if request_func_input.model_name else request_func_input.model,
|
if request_func_input.model_name
|
||||||
|
else request_func_input.model,
|
||||||
"messages": [
|
"messages": [
|
||||||
{
|
{"role": "user", "content": content},
|
||||||
"role": "user",
|
|
||||||
"content": content
|
|
||||||
},
|
|
||||||
],
|
],
|
||||||
"temperature": 0.0,
|
"temperature": 0.0,
|
||||||
"max_completion_tokens": request_func_input.output_len,
|
"max_completion_tokens": request_func_input.output_len,
|
||||||
@@ -389,16 +396,22 @@ async def async_request_openai_chat_completions(
         st = time.perf_counter()
         most_recent_timestamp = st
         try:
-            async with session.post(url=api_url, json=payload,
-                                    headers=headers) as response:
+            async with session.post(
+                url=api_url, json=payload, headers=headers
+            ) as response:
                 if response.status == 200:
                     async for chunk_bytes in response.content:
                         chunk_bytes = chunk_bytes.strip()
                         if not chunk_bytes:
                             continue
+                        chunk_bytes = chunk_bytes.decode("utf-8")
+                        # NOTE: SSE comments (often used as pings) start with a colon.
+                        # These are not JSON data payload and should be skipped.
+                        if chunk_bytes.startswith(":"):
+                            continue
 
-                        chunk = chunk_bytes.decode("utf-8").removeprefix(
-                            "data: ")
+                        chunk = chunk_bytes.removeprefix("data: ")
                         if chunk != "[DONE]":
                             timestamp = time.perf_counter()
                             data = json.loads(chunk)
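The NOTE added in this hunk matters because SSE servers may emit comment lines (starting with ":") as keep-alive pings, and feeding those to `json.loads` would crash the parser. A small sketch of that filtering, with made-up sample lines, is shown below.

```
# Sketch of the SSE line filtering described in the NOTE above: comment lines
# (starting with ":") are keep-alive pings and carry no JSON payload.
import json


def parse_sse_lines(lines):
    """Yield decoded JSON payloads from raw SSE lines, skipping pings and [DONE]."""
    for raw in lines:
        line = raw.strip()
        if not line:
            continue
        text = line.decode("utf-8") if isinstance(line, bytes) else line
        if text.startswith(":"):          # SSE comment / ping
            continue
        data = text.removeprefix("data: ")
        if data == "[DONE]":
            continue
        yield json.loads(data)


sample = [
    b": ping",
    b'data: {"choices": [{"delta": {"content": "Hi"}}]}',
    b"data: [DONE]",
]
for event in parse_sse_lines(sample):
    print(event["choices"][0]["delta"]["content"])
```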
@ -412,13 +425,11 @@ async def async_request_openai_chat_completions(
|
|||||||
|
|
||||||
# Decoding phase
|
# Decoding phase
|
||||||
else:
|
else:
|
||||||
output.itl.append(timestamp -
|
output.itl.append(timestamp - most_recent_timestamp)
|
||||||
most_recent_timestamp)
|
|
||||||
|
|
||||||
generated_text += content or ""
|
generated_text += content or ""
|
||||||
elif usage := data.get("usage"):
|
elif usage := data.get("usage"):
|
||||||
output.output_tokens = usage.get(
|
output.output_tokens = usage.get("completion_tokens")
|
||||||
"completion_tokens")
|
|
||||||
|
|
||||||
most_recent_timestamp = timestamp
|
most_recent_timestamp = timestamp
|
||||||
|
|
||||||
@ -444,25 +455,28 @@ async def async_request_openai_audio(
|
|||||||
) -> RequestFuncOutput:
|
) -> RequestFuncOutput:
|
||||||
# Lazy import without PlaceholderModule to avoid vllm dep.
|
# Lazy import without PlaceholderModule to avoid vllm dep.
|
||||||
import soundfile
|
import soundfile
|
||||||
|
|
||||||
api_url = request_func_input.api_url
|
api_url = request_func_input.api_url
|
||||||
assert api_url.endswith(
|
assert api_url.endswith(("transcriptions", "translations")), (
|
||||||
("transcriptions", "translations"
|
"OpenAI Chat Completions API URL must end with 'transcriptions' "
|
||||||
)), "OpenAI Chat Completions API URL must end with 'transcriptions' "
|
)
|
||||||
"or `translations`."
|
"or `translations`."
|
||||||
|
|
||||||
async with aiohttp.ClientSession(trust_env=True,
|
async with aiohttp.ClientSession(
|
||||||
timeout=AIOHTTP_TIMEOUT) as session:
|
trust_env=True, timeout=AIOHTTP_TIMEOUT
|
||||||
|
) as session:
|
||||||
content = [{"type": "text", "text": request_func_input.prompt}]
|
content = [{"type": "text", "text": request_func_input.prompt}]
|
||||||
payload = {
|
payload = {
|
||||||
"model": request_func_input.model_name \
|
"model": request_func_input.model_name
|
||||||
if request_func_input.model_name else request_func_input.model,
|
if request_func_input.model_name
|
||||||
|
else request_func_input.model,
|
||||||
"temperature": 0.0,
|
"temperature": 0.0,
|
||||||
"max_completion_tokens": request_func_input.output_len,
|
"max_completion_tokens": request_func_input.output_len,
|
||||||
"stream": True,
|
"stream": True,
|
||||||
"language": "en",
|
"language": "en",
|
||||||
# Flattened due to multipart/form-data
|
# Flattened due to multipart/form-data
|
||||||
"stream_include_usage": True,
|
"stream_include_usage": True,
|
||||||
"stream_continuous_usage_stats": True
|
"stream_continuous_usage_stats": True,
|
||||||
}
|
}
|
||||||
if request_func_input.extra_body:
|
if request_func_input.extra_body:
|
||||||
payload.update(request_func_input.extra_body)
|
payload.update(request_func_input.extra_body)
|
||||||
@ -477,9 +491,9 @@ async def async_request_openai_audio(
|
|||||||
buffer.seek(0)
|
buffer.seek(0)
|
||||||
return buffer
|
return buffer
|
||||||
|
|
||||||
with to_bytes(*request_func_input.multi_modal_content['audio']) as f:
|
with to_bytes(*request_func_input.multi_modal_content["audio"]) as f:
|
||||||
form = aiohttp.FormData()
|
form = aiohttp.FormData()
|
||||||
form.add_field('file', f, content_type='audio/wav')
|
form.add_field("file", f, content_type="audio/wav")
|
||||||
for key, value in payload.items():
|
for key, value in payload.items():
|
||||||
form.add_field(key, str(value))
|
form.add_field(key, str(value))
|
||||||
|
|
||||||
@ -491,24 +505,22 @@ async def async_request_openai_audio(
|
|||||||
st = time.perf_counter()
|
st = time.perf_counter()
|
||||||
most_recent_timestamp = st
|
most_recent_timestamp = st
|
||||||
try:
|
try:
|
||||||
async with session.post(url=api_url,
|
async with session.post(
|
||||||
data=form,
|
url=api_url, data=form, headers=headers
|
||||||
headers=headers) as response:
|
) as response:
|
||||||
if response.status == 200:
|
if response.status == 200:
|
||||||
async for chunk_bytes in response.content:
|
async for chunk_bytes in response.content:
|
||||||
chunk_bytes = chunk_bytes.strip()
|
chunk_bytes = chunk_bytes.strip()
|
||||||
if not chunk_bytes:
|
if not chunk_bytes:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
chunk = chunk_bytes.decode("utf-8").removeprefix(
|
chunk = chunk_bytes.decode("utf-8").removeprefix("data: ")
|
||||||
"data: ")
|
|
||||||
if chunk != "[DONE]":
|
if chunk != "[DONE]":
|
||||||
timestamp = time.perf_counter()
|
timestamp = time.perf_counter()
|
||||||
data = json.loads(chunk)
|
data = json.loads(chunk)
|
||||||
|
|
||||||
if choices := data.get("choices"):
|
if choices := data.get("choices"):
|
||||||
content = choices[0]["delta"].get(
|
content = choices[0]["delta"].get("content")
|
||||||
"content")
|
|
||||||
# First token
|
# First token
|
||||||
if ttft == 0.0:
|
if ttft == 0.0:
|
||||||
ttft = timestamp - st
|
ttft = timestamp - st
|
||||||
@ -517,12 +529,14 @@ async def async_request_openai_audio(
|
|||||||
# Decoding phase
|
# Decoding phase
|
||||||
else:
|
else:
|
||||||
output.itl.append(
|
output.itl.append(
|
||||||
timestamp - most_recent_timestamp)
|
timestamp - most_recent_timestamp
|
||||||
|
)
|
||||||
|
|
||||||
generated_text += content or ""
|
generated_text += content or ""
|
||||||
elif usage := data.get("usage"):
|
elif usage := data.get("usage"):
|
||||||
output.output_tokens = usage.get(
|
output.output_tokens = usage.get(
|
||||||
"completion_tokens")
|
"completion_tokens"
|
||||||
|
)
|
||||||
|
|
||||||
most_recent_timestamp = timestamp
|
most_recent_timestamp = timestamp
|
||||||
|
|
||||||
@ -543,7 +557,7 @@ async def async_request_openai_audio(
|
|||||||
|
|
||||||
|
|
||||||
def get_model(pretrained_model_name_or_path: str) -> str:
|
def get_model(pretrained_model_name_or_path: str) -> str:
|
||||||
if os.getenv('VLLM_USE_MODELSCOPE', 'False').lower() == 'true':
|
if os.getenv("VLLM_USE_MODELSCOPE", "False").lower() == "true":
|
||||||
from modelscope import snapshot_download
|
from modelscope import snapshot_download
|
||||||
|
|
||||||
from vllm.model_executor.model_loader.weight_utils import get_lock
|
from vllm.model_executor.model_loader.weight_utils import get_lock
|
||||||
@ -554,7 +568,8 @@ def get_model(pretrained_model_name_or_path: str) -> str:
|
|||||||
model_path = snapshot_download(
|
model_path = snapshot_download(
|
||||||
model_id=pretrained_model_name_or_path,
|
model_id=pretrained_model_name_or_path,
|
||||||
local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
|
local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
|
||||||
ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"])
|
ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"],
|
||||||
|
)
|
||||||
|
|
||||||
return model_path
|
return model_path
|
||||||
return pretrained_model_name_or_path
|
return pretrained_model_name_or_path
|
||||||
@ -567,23 +582,23 @@ def get_tokenizer(
|
|||||||
**kwargs,
|
**kwargs,
|
||||||
) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
|
) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
|
||||||
if pretrained_model_name_or_path is not None and not os.path.exists(
|
if pretrained_model_name_or_path is not None and not os.path.exists(
|
||||||
pretrained_model_name_or_path):
|
pretrained_model_name_or_path
|
||||||
pretrained_model_name_or_path = get_model(
|
):
|
||||||
pretrained_model_name_or_path)
|
pretrained_model_name_or_path = get_model(pretrained_model_name_or_path)
|
||||||
if tokenizer_mode == "slow":
|
if tokenizer_mode == "slow":
|
||||||
if kwargs.get("use_fast", False):
|
if kwargs.get("use_fast", False):
|
||||||
raise ValueError(
|
raise ValueError("Cannot use the fast tokenizer in slow tokenizer mode.")
|
||||||
"Cannot use the fast tokenizer in slow tokenizer mode.")
|
|
||||||
kwargs["use_fast"] = False
|
kwargs["use_fast"] = False
|
||||||
if tokenizer_mode == "mistral":
|
if tokenizer_mode == "mistral":
|
||||||
try:
|
try:
|
||||||
from vllm.transformers_utils.tokenizer import MistralTokenizer
|
from vllm.transformers_utils.tokenizer import MistralTokenizer
|
||||||
except ImportError as e:
|
except ImportError as e:
|
||||||
raise ImportError("MistralTokenizer requires vllm package.\n"
|
raise ImportError(
|
||||||
"Please install it with `pip install vllm` "
|
"MistralTokenizer requires vllm package.\n"
|
||||||
"to use mistral tokenizer mode.") from e
|
"Please install it with `pip install vllm` "
|
||||||
return MistralTokenizer.from_pretrained(
|
"to use mistral tokenizer mode."
|
||||||
str(pretrained_model_name_or_path))
|
) from e
|
||||||
|
return MistralTokenizer.from_pretrained(str(pretrained_model_name_or_path))
|
||||||
else:
|
else:
|
||||||
return AutoTokenizer.from_pretrained(
|
return AutoTokenizer.from_pretrained(
|
||||||
pretrained_model_name_or_path,
|
pretrained_model_name_or_path,
|
||||||
@ -603,10 +618,11 @@ ASYNC_REQUEST_FUNCS = {
|
|||||||
"tensorrt-llm": async_request_trt_llm,
|
"tensorrt-llm": async_request_trt_llm,
|
||||||
"scalellm": async_request_openai_completions,
|
"scalellm": async_request_openai_completions,
|
||||||
"sglang": async_request_openai_completions,
|
"sglang": async_request_openai_completions,
|
||||||
|
"llama.cpp": async_request_openai_completions,
|
||||||
}
|
}
|
||||||
|
|
||||||
OPENAI_COMPATIBLE_BACKENDS = [
|
OPENAI_COMPATIBLE_BACKENDS = [
|
||||||
k for k, v in ASYNC_REQUEST_FUNCS.items()
|
k
|
||||||
if v in (async_request_openai_completions,
|
for k, v in ASYNC_REQUEST_FUNCS.items()
|
||||||
async_request_openai_chat_completions)
|
if v in (async_request_openai_completions, async_request_openai_chat_completions)
|
||||||
]
|
]
|
||||||
|
|||||||
@ -1,4 +1,5 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
"""
|
"""
|
||||||
This module defines a framework for sampling benchmark requests from various
|
This module defines a framework for sampling benchmark requests from various
|
||||||
datasets. Each dataset subclass of BenchmarkDataset must implement sample
|
datasets. Each dataset subclass of BenchmarkDataset must implement sample
|
||||||
@ -9,9 +10,6 @@ generation. Supported dataset types include:
|
|||||||
- BurstGPT
|
- BurstGPT
|
||||||
- HuggingFace
|
- HuggingFace
|
||||||
- VisionArena
|
- VisionArena
|
||||||
|
|
||||||
TODO: Implement CustomDataset to parse a JSON file and convert its contents into
|
|
||||||
SampleRequest instances, similar to the approach used in ShareGPT.
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import base64
|
import base64
|
||||||
@ -35,6 +33,7 @@ from transformers import PreTrainedTokenizerBase
|
|||||||
from vllm.lora.request import LoRARequest
|
from vllm.lora.request import LoRARequest
|
||||||
from vllm.lora.utils import get_adapter_absolute_path
|
from vllm.lora.utils import get_adapter_absolute_path
|
||||||
from vllm.multimodal import MultiModalDataDict
|
from vllm.multimodal import MultiModalDataDict
|
||||||
|
from vllm.multimodal.image import convert_image_mode
|
||||||
from vllm.transformers_utils.tokenizer import AnyTokenizer, get_lora_tokenizer
|
from vllm.transformers_utils.tokenizer import AnyTokenizer, get_lora_tokenizer
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
@ -82,14 +81,12 @@ class BenchmarkDataset(ABC):
|
|||||||
self.dataset_path = dataset_path
|
self.dataset_path = dataset_path
|
||||||
# Set the random seed, ensuring that a None value is replaced with the
|
# Set the random seed, ensuring that a None value is replaced with the
|
||||||
# default seed.
|
# default seed.
|
||||||
self.random_seed = (random_seed
|
self.random_seed = random_seed if random_seed is not None else self.DEFAULT_SEED
|
||||||
if random_seed is not None else self.DEFAULT_SEED)
|
|
||||||
self.data = None
|
self.data = None
|
||||||
|
|
||||||
def apply_multimodal_chat_transformation(
|
def apply_multimodal_chat_transformation(
|
||||||
self,
|
self, prompt: str, mm_content: Optional[MultiModalDataDict] = None
|
||||||
prompt: str,
|
) -> list[dict]:
|
||||||
mm_content: Optional[MultiModalDataDict] = None) -> list[dict]:
|
|
||||||
"""
|
"""
|
||||||
Transform a prompt and optional multimodal content into a chat format.
|
Transform a prompt and optional multimodal content into a chat format.
|
||||||
This method is used for chat models that expect a specific conversation
|
This method is used for chat models that expect a specific conversation
|
||||||
@ -111,8 +108,7 @@ class BenchmarkDataset(ABC):
|
|||||||
NotImplementedError: If a subclass does not implement this method.
|
NotImplementedError: If a subclass does not implement this method.
|
||||||
"""
|
"""
|
||||||
# TODO (jenniferzhao): add support for downloading data
|
# TODO (jenniferzhao): add support for downloading data
|
||||||
raise NotImplementedError(
|
raise NotImplementedError("load_data must be implemented in subclasses.")
|
||||||
"load_data must be implemented in subclasses.")
|
|
||||||
|
|
||||||
def get_random_lora_request(
|
def get_random_lora_request(
|
||||||
self,
|
self,
|
||||||
@ -158,8 +154,9 @@ class BenchmarkDataset(ABC):
|
|||||||
return lora_request, lora_tokenizer_cache[lora_id] or tokenizer
|
return lora_request, lora_tokenizer_cache[lora_id] or tokenizer
|
||||||
|
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def sample(self, tokenizer: PreTrainedTokenizerBase,
|
def sample(
|
||||||
num_requests: int) -> list[SampleRequest]:
|
self, tokenizer: PreTrainedTokenizerBase, num_requests: int
|
||||||
|
) -> list[SampleRequest]:
|
||||||
"""
|
"""
|
||||||
Abstract method to generate sample requests from the dataset.
|
Abstract method to generate sample requests from the dataset.
|
||||||
|
|
||||||
@ -177,8 +174,9 @@ class BenchmarkDataset(ABC):
|
|||||||
"""
|
"""
|
||||||
raise NotImplementedError("sample must be implemented in subclasses.")
|
raise NotImplementedError("sample must be implemented in subclasses.")
|
||||||
|
|
||||||
def maybe_oversample_requests(self, requests: list[SampleRequest],
|
def maybe_oversample_requests(
|
||||||
num_requests: int) -> None:
|
self, requests: list[SampleRequest], num_requests: int
|
||||||
|
) -> None:
|
||||||
"""
|
"""
|
||||||
Oversamples the list of requests if its size is less than the desired
|
Oversamples the list of requests if its size is less than the desired
|
||||||
number.
|
number.
|
||||||
@ -189,11 +187,9 @@ class BenchmarkDataset(ABC):
|
|||||||
"""
|
"""
|
||||||
if len(requests) < num_requests:
|
if len(requests) < num_requests:
|
||||||
random.seed(self.random_seed)
|
random.seed(self.random_seed)
|
||||||
additional = random.choices(requests,
|
additional = random.choices(requests, k=num_requests - len(requests))
|
||||||
k=num_requests - len(requests))
|
|
||||||
requests.extend(additional)
|
requests.extend(additional)
|
||||||
logger.info("Oversampled requests to reach %d total samples.",
|
logger.info("Oversampled requests to reach %d total samples.", num_requests)
|
||||||
num_requests)
|
|
||||||
|
|
||||||
|
|
||||||
# -----------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------
|
||||||
@ -218,14 +214,14 @@ def is_valid_sequence(
|
|||||||
"""
|
"""
|
||||||
# Check for invalid conditions
|
# Check for invalid conditions
|
||||||
prompt_too_short = prompt_len < min_len
|
prompt_too_short = prompt_len < min_len
|
||||||
output_too_short = (not skip_min_output_len_check) and (output_len
|
output_too_short = (not skip_min_output_len_check) and (output_len < min_len)
|
||||||
< min_len)
|
|
||||||
prompt_too_long = prompt_len > max_prompt_len
|
prompt_too_long = prompt_len > max_prompt_len
|
||||||
combined_too_long = (prompt_len + output_len) > max_total_len
|
combined_too_long = (prompt_len + output_len) > max_total_len
|
||||||
|
|
||||||
# Return True if none of the invalid conditions are met
|
# Return True if none of the invalid conditions are met
|
||||||
return not (prompt_too_short or output_too_short or prompt_too_long
|
return not (
|
||||||
or combined_too_long)
|
prompt_too_short or output_too_short or prompt_too_long or combined_too_long
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
@cache
|
@cache
|
||||||
@ -257,28 +253,28 @@ def process_image(image: Any) -> Mapping[str, Any]:
|
|||||||
Raises:
|
Raises:
|
||||||
ValueError: If the input is not a supported type.
|
ValueError: If the input is not a supported type.
|
||||||
"""
|
"""
|
||||||
if isinstance(image, dict) and 'bytes' in image:
|
if isinstance(image, dict) and "bytes" in image:
|
||||||
image = Image.open(BytesIO(image['bytes']))
|
image = Image.open(BytesIO(image["bytes"]))
|
||||||
if isinstance(image, Image.Image):
|
if isinstance(image, Image.Image):
|
||||||
image = image.convert("RGB")
|
image = convert_image_mode(image, "RGB")
|
||||||
with io.BytesIO() as image_data:
|
with io.BytesIO() as image_data:
|
||||||
image.save(image_data, format="JPEG")
|
image.save(image_data, format="JPEG")
|
||||||
image_base64 = base64.b64encode(
|
image_base64 = base64.b64encode(image_data.getvalue()).decode("utf-8")
|
||||||
image_data.getvalue()).decode("utf-8")
|
|
||||||
return {
|
return {
|
||||||
"type": "image_url",
|
"type": "image_url",
|
||||||
"image_url": {
|
"image_url": {"url": f"data:image/jpeg;base64,{image_base64}"},
|
||||||
"url": f"data:image/jpeg;base64,{image_base64}"
|
|
||||||
},
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if isinstance(image, str):
|
if isinstance(image, str):
|
||||||
image_url = (image if image.startswith(
|
image_url = (
|
||||||
("http://", "file://")) else f"file://{image}")
|
image if image.startswith(("http://", "file://")) else f"file://{image}"
|
||||||
|
)
|
||||||
return {"type": "image_url", "image_url": {"url": image_url}}
|
return {"type": "image_url", "image_url": {"url": image_url}}
|
||||||
|
|
||||||
raise ValueError(f"Invalid image input {image}. Must be a PIL.Image.Image"
|
raise ValueError(
|
||||||
" or str or dictionary with raw image bytes.")
|
f"Invalid image input {image}. Must be a PIL.Image.Image"
|
||||||
|
" or str or dictionary with raw image bytes."
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
# -----------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------
|
||||||
@@ -315,42 +311,57 @@ class RandomDataset(BenchmarkDataset):
         )
 
         vocab_size = tokenizer.vocab_size
+        num_special_tokens = tokenizer.num_special_tokens_to_add()
+        real_input_len = input_len - num_special_tokens
 
-        prefix_token_ids = (np.random.randint(
-            0, vocab_size, size=prefix_len).tolist() if prefix_len > 0 else [])
+        prefix_token_ids = (
+            np.random.randint(0, vocab_size, size=prefix_len).tolist()
+            if prefix_len > 0
+            else []
+        )
 
         # New sampling logic: [X * (1 - b), X * (1 + b)]
-        input_low = int(input_len * (1 - range_ratio))
-        input_high = int(input_len * (1 + range_ratio))
+        input_low = int(real_input_len * (1 - range_ratio))
+        input_high = int(real_input_len * (1 + range_ratio))
         output_low = int(output_len * (1 - range_ratio))
         output_high = int(output_len * (1 + range_ratio))
 
         # Add logging for debugging
         logger.info("Sampling input_len from [%s, %s]", input_low, input_high)
-        logger.info("Sampling output_len from [%s, %s]", output_low,
-                    output_high)
+        logger.info("Sampling output_len from [%s, %s]", output_low, output_high)
 
-        input_lens = np.random.randint(input_low,
-                                       input_high + 1,
-                                       size=num_requests)
-        output_lens = np.random.randint(output_low,
-                                        output_high + 1,
-                                        size=num_requests)
+        input_lens = np.random.randint(input_low, input_high + 1, size=num_requests)
+        output_lens = np.random.randint(output_low, output_high + 1, size=num_requests)
         offsets = np.random.randint(0, vocab_size, size=num_requests)
 
         requests = []
         for i in range(num_requests):
-            inner_seq = ((offsets[i] + i + np.arange(input_lens[i])) %
-                         vocab_size).tolist()
+            inner_seq = (
+                (offsets[i] + i + np.arange(input_lens[i])) % vocab_size
+            ).tolist()
             token_sequence = prefix_token_ids + inner_seq
             prompt = tokenizer.decode(token_sequence)
+            # After decoding the prompt we have to encode and decode it again.
+            # This is done because in some cases N consecutive tokens
+            # give a string tokenized into != N number of tokens.
+            # For example for GPT2Tokenizer:
+            # [6880, 6881] -> ['Ġcalls', 'here'] ->
+            # [1650, 939, 486] -> ['Ġcall', 'sh', 'ere']
+            # To avoid uncontrolled change of the prompt length,
+            # the encoded sequence is truncated before being decode again.
             total_input_len = prefix_len + int(input_lens[i])
+            re_encoded_sequence = tokenizer.encode(prompt, add_special_tokens=False)[
+                :total_input_len
+            ]
+            prompt = tokenizer.decode(re_encoded_sequence)
+            total_input_len = len(re_encoded_sequence)
             requests.append(
                 SampleRequest(
                     prompt=prompt,
                     prompt_len=total_input_len,
                     expected_output_len=int(output_lens[i]),
-                ))
+                )
+            )
         return requests
 
 
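The comment block added in this hunk explains why randomly drawn token IDs cannot be decoded once and trusted: re-tokenizing the decoded string can yield a different number of tokens, so the new code re-encodes and truncates before reporting the prompt length. A hedged, self-contained sketch of that decode/re-encode/truncate trick is given below; it assumes the `transformers` package and the public `gpt2` tokenizer are available, which are examples and not requirements stated by the diff.

```
# Hedged sketch of the decode -> re-encode -> truncate trick from the hunk above.
# Assumption: the `transformers` package and the "gpt2" tokenizer can be downloaded.
import numpy as np
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
target_len = 32
token_ids = np.random.randint(0, tokenizer.vocab_size, size=target_len).tolist()

prompt = tokenizer.decode(token_ids)
# Re-encoding may produce a different number of tokens than we started with,
# so truncate the re-encoded ids and decode once more to pin the length down.
re_encoded = tokenizer.encode(prompt, add_special_tokens=False)[:target_len]
stable_prompt = tokenizer.decode(re_encoded)

print(len(token_ids), len(re_encoded))  # the second count is what gets reported
```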
@ -377,7 +388,8 @@ class ShareGPTDataset(BenchmarkDataset):
|
|||||||
self.data = json.load(f)
|
self.data = json.load(f)
|
||||||
# Filter entries with at least two conversation turns.
|
# Filter entries with at least two conversation turns.
|
||||||
self.data = [
|
self.data = [
|
||||||
entry for entry in self.data
|
entry
|
||||||
|
for entry in self.data
|
||||||
if "conversations" in entry and len(entry["conversations"]) >= 2
|
if "conversations" in entry and len(entry["conversations"]) >= 2
|
||||||
]
|
]
|
||||||
random.seed(self.random_seed)
|
random.seed(self.random_seed)
|
||||||
@ -403,31 +415,123 @@ class ShareGPTDataset(BenchmarkDataset):
|
|||||||
)
|
)
|
||||||
|
|
||||||
lora_request, tokenizer = self.get_random_lora_request(
|
lora_request, tokenizer = self.get_random_lora_request(
|
||||||
tokenizer=tokenizer, max_loras=max_loras, lora_path=lora_path)
|
tokenizer=tokenizer, max_loras=max_loras, lora_path=lora_path
|
||||||
|
)
|
||||||
prompt_ids = tokenizer(prompt).input_ids
|
prompt_ids = tokenizer(prompt).input_ids
|
||||||
completion_ids = tokenizer(completion).input_ids
|
completion_ids = tokenizer(completion).input_ids
|
||||||
prompt_len = len(prompt_ids)
|
prompt_len = len(prompt_ids)
|
||||||
new_output_len = (len(completion_ids)
|
new_output_len = len(completion_ids) if output_len is None else output_len
|
||||||
if output_len is None else output_len)
|
if not is_valid_sequence(
|
||||||
if not is_valid_sequence(prompt_len,
|
prompt_len,
|
||||||
new_output_len,
|
new_output_len,
|
||||||
skip_min_output_len_check=output_len
|
skip_min_output_len_check=output_len is not None,
|
||||||
is not None):
|
):
|
||||||
continue
|
continue
|
||||||
if enable_multimodal_chat:
|
if enable_multimodal_chat:
|
||||||
prompt = self.apply_multimodal_chat_transformation(
|
prompt = self.apply_multimodal_chat_transformation(prompt, None)
|
||||||
prompt, None)
|
|
||||||
samples.append(
|
samples.append(
|
||||||
SampleRequest(
|
SampleRequest(
|
||||||
prompt=prompt,
|
prompt=prompt,
|
||||||
prompt_len=prompt_len,
|
prompt_len=prompt_len,
|
||||||
expected_output_len=new_output_len,
|
expected_output_len=new_output_len,
|
||||||
lora_request=lora_request,
|
lora_request=lora_request,
|
||||||
))
|
)
|
||||||
|
)
|
||||||
self.maybe_oversample_requests(samples, num_requests)
|
self.maybe_oversample_requests(samples, num_requests)
|
||||||
return samples
|
return samples
|
||||||
 
 
+# -----------------------------------------------------------------------------
+# Custom Dataset Implementation
+# -----------------------------------------------------------------------------
+
+
+class CustomDataset(BenchmarkDataset):
+    """
+    Implements the Custom dataset. Loads data from a JSONL file and generates
+    sample requests based on conversation turns. E.g.,
+    ```
+    {"prompt": "What is the capital of India?"}
+    {"prompt": "What is the capital of Iran?"}
+    {"prompt": "What is the capital of China?"}
+    ```
+    """
+
+    def __init__(self, **kwargs) -> None:
+        super().__init__(**kwargs)
+        self.load_data()
+
+    def load_data(self) -> None:
+        if self.dataset_path is None:
+            raise ValueError("dataset_path must be provided for loading data.")
+
+        # self.data will be a list of dictionaries
+        # e.g., [{"prompt": "What is the capital of India?"}, ...]
+        # This will be the standardized format which load_data()
+        # has to convert into depending on the filetype of dataset_path.
+        # sample() will assume this standardized format of self.data
+        self.data = []
+
+        # Load the JSONL file
+        if self.dataset_path.endswith(".jsonl"):
+            jsonl_data = pd.read_json(path_or_buf=self.dataset_path, lines=True)
+
+            # check if the JSONL file has a 'prompt' column
+            if "prompt" not in jsonl_data.columns:
+                raise ValueError("JSONL file must contain a 'prompt' column.")
+
+            # Convert each row to a dictionary and append to self.data
+            # This will convert the DataFrame to a list of dictionaries
+            # where each dictionary corresponds to a row in the DataFrame.
+            # This is the standardized format we want for self.data
+            for _, row in jsonl_data.iterrows():
+                self.data.append(row.to_dict())
+        else:
+            raise NotImplementedError(
+                "Only JSONL format is supported for CustomDataset."
+            )
+
+        random.seed(self.random_seed)
+        random.shuffle(self.data)
+
+    def sample(
+        self,
+        tokenizer: PreTrainedTokenizerBase,
+        num_requests: int,
+        lora_path: Optional[str] = None,
+        max_loras: Optional[int] = None,
+        output_len: Optional[int] = None,
+        enable_multimodal_chat: bool = False,
+        skip_chat_template: bool = False,
+        **kwargs,
+    ) -> list:
+        sampled_requests = []
+        for item in self.data:
+            if len(sampled_requests) >= num_requests:
+                break
+            prompt = item["prompt"]
+
+            # apply template
+            if not skip_chat_template:
+                prompt = tokenizer.apply_chat_template(
+                    [{"role": "user", "content": prompt}],
+                    add_generation_prompt=True,
+                    tokenize=False,
+                )
+
+            prompt_len = len(tokenizer(prompt).input_ids)
+            sampled_requests.append(
+                SampleRequest(
+                    prompt=prompt,
+                    prompt_len=prompt_len,
+                    expected_output_len=output_len,
+                )
+            )
+        self.maybe_oversample_requests(sampled_requests, num_requests)
+
+        return sampled_requests
 
 
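The new `CustomDataset` leans on pandas to turn a JSONL file with a `prompt` column into a list of dictionaries. The short sketch below shows that round trip on a throwaway file; the file name and prompts are placeholders for the example, not fixtures from the repository.

```
# Sketch of the JSONL round-trip CustomDataset relies on (file name is a placeholder).
import pandas as pd

with open("custom_prompts.jsonl", "w") as f:
    f.write('{"prompt": "What is the capital of India?"}\n')
    f.write('{"prompt": "What is the capital of Iran?"}\n')

rows = pd.read_json(path_or_buf="custom_prompts.jsonl", lines=True)
assert "prompt" in rows.columns, "JSONL file must contain a 'prompt' column."
data = [row.to_dict() for _, row in rows.iterrows()]
print(data[0]["prompt"])
```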
# -----------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------
|
||||||
# Sonnet Dataset Implementation
|
# Sonnet Dataset Implementation
|
||||||
# -----------------------------------------------------------------------------
|
# -----------------------------------------------------------------------------
|
||||||
@ -469,20 +573,20 @@ class SonnetDataset(BenchmarkDataset):
|
|||||||
) -> list:
|
) -> list:
|
||||||
# Calculate average token length for a poem line.
|
# Calculate average token length for a poem line.
|
||||||
tokenized_lines = [tokenizer(line).input_ids for line in self.data]
|
tokenized_lines = [tokenizer(line).input_ids for line in self.data]
|
||||||
avg_len = sum(len(tokens)
|
avg_len = sum(len(tokens) for tokens in tokenized_lines) / len(tokenized_lines)
|
||||||
for tokens in tokenized_lines) / len(tokenized_lines)
|
|
||||||
|
|
||||||
# Build the base prompt.
|
# Build the base prompt.
|
||||||
base_prompt = "Pick as many lines as you can from these poem lines:\n"
|
base_prompt = "Pick as many lines as you can from these poem lines:\n"
|
||||||
base_msg = [{"role": "user", "content": base_prompt}]
|
base_msg = [{"role": "user", "content": base_prompt}]
|
||||||
base_fmt = tokenizer.apply_chat_template(base_msg,
|
base_fmt = tokenizer.apply_chat_template(
|
||||||
add_generation_prompt=True,
|
base_msg, add_generation_prompt=True, tokenize=False
|
||||||
tokenize=False)
|
)
|
||||||
base_offset = len(tokenizer(base_fmt).input_ids)
|
base_offset = len(tokenizer(base_fmt).input_ids)
|
||||||
if input_len <= base_offset:
|
if input_len <= base_offset:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
f"'input_len' must be higher than the base prompt length "
|
f"'input_len' must be higher than the base prompt length "
|
||||||
f"({base_offset}).")
|
f"({base_offset})."
|
||||||
|
)
|
||||||
|
|
||||||
# Determine how many poem lines to use.
|
# Determine how many poem lines to use.
|
||||||
num_input_lines = round((input_len - base_offset) / avg_len)
|
num_input_lines = round((input_len - base_offset) / avg_len)
|
||||||
@ -491,21 +595,23 @@ class SonnetDataset(BenchmarkDataset):
|
|||||||
|
|
||||||
samples = []
|
samples = []
|
||||||
while len(samples) < num_requests:
|
while len(samples) < num_requests:
|
||||||
extra_lines = random.choices(self.data,
|
extra_lines = random.choices(
|
||||||
k=num_input_lines - num_prefix_lines)
|
self.data, k=num_input_lines - num_prefix_lines
|
||||||
|
)
|
||||||
prompt = f"{base_prompt}{''.join(prefix_lines + extra_lines)}"
|
prompt = f"{base_prompt}{''.join(prefix_lines + extra_lines)}"
|
||||||
msg = [{"role": "user", "content": prompt}]
|
msg = [{"role": "user", "content": prompt}]
|
||||||
prompt_formatted = tokenizer.apply_chat_template(
|
prompt_formatted = tokenizer.apply_chat_template(
|
||||||
msg, add_generation_prompt=True, tokenize=False)
|
msg, add_generation_prompt=True, tokenize=False
|
||||||
|
)
|
||||||
prompt_len = len(tokenizer(prompt_formatted).input_ids)
|
prompt_len = len(tokenizer(prompt_formatted).input_ids)
|
||||||
if prompt_len <= input_len:
|
if prompt_len <= input_len:
|
||||||
samples.append(
|
samples.append(
|
||||||
SampleRequest(
|
SampleRequest(
|
||||||
prompt=prompt_formatted
|
prompt=prompt_formatted if return_prompt_formatted else prompt,
|
||||||
if return_prompt_formatted else prompt,
|
|
||||||
prompt_len=prompt_len,
|
prompt_len=prompt_len,
|
||||||
expected_output_len=output_len,
|
expected_output_len=output_len,
|
||||||
))
|
)
|
||||||
|
)
|
||||||
return samples
|
return samples
|
||||||
|
|
||||||
|
|
||||||
@ -525,7 +631,9 @@ class BurstGPTDataset(BenchmarkDataset):
|
|||||||
super().__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
self.load_data()
|
self.load_data()
|
||||||
|
|
||||||
def load_data(self, ):
|
def load_data(
|
||||||
|
self,
|
||||||
|
):
|
||||||
if self.dataset_path is None:
|
if self.dataset_path is None:
|
||||||
raise ValueError("dataset_path must be provided for loading data.")
|
raise ValueError("dataset_path must be provided for loading data.")
|
||||||
|
|
||||||
@ -539,8 +647,7 @@ class BurstGPTDataset(BenchmarkDataset):
|
|||||||
|
|
||||||
def _sample_loaded_data(self, num_requests: int) -> list:
|
def _sample_loaded_data(self, num_requests: int) -> list:
|
||||||
if num_requests <= len(self.data):
|
if num_requests <= len(self.data):
|
||||||
data = self.data.sample(n=num_requests,
|
data = self.data.sample(n=num_requests, random_state=self.random_seed)
|
||||||
random_state=self.random_seed)
|
|
||||||
else:
|
else:
|
||||||
data = self.data.sample(
|
data = self.data.sample(
|
||||||
n=num_requests,
|
n=num_requests,
|
||||||
@ -564,7 +671,8 @@ class BurstGPTDataset(BenchmarkDataset):
|
|||||||
input_len = int(data[i][2])
|
input_len = int(data[i][2])
|
||||||
output_len = int(data[i][3])
|
output_len = int(data[i][3])
|
||||||
lora_req, tokenizer = self.get_random_lora_request(
|
lora_req, tokenizer = self.get_random_lora_request(
|
||||||
tokenizer=tokenizer, max_loras=max_loras, lora_path=lora_path)
|
tokenizer=tokenizer, max_loras=max_loras, lora_path=lora_path
|
||||||
|
)
|
||||||
vocab_size = tokenizer.vocab_size
|
vocab_size = tokenizer.vocab_size
|
||||||
# Generate a synthetic prompt: a list of token IDs computed as (i +
|
# Generate a synthetic prompt: a list of token IDs computed as (i +
|
||||||
# j) modulo vocab_size.
|
# j) modulo vocab_size.
|
||||||
@ -576,7 +684,8 @@ class BurstGPTDataset(BenchmarkDataset):
|
|||||||
prompt_len=input_len,
|
prompt_len=input_len,
|
||||||
expected_output_len=output_len,
|
expected_output_len=output_len,
|
||||||
lora_request=lora_req,
|
lora_request=lora_req,
|
||||||
))
|
)
|
||||||
|
)
|
||||||
return samples
|
return samples
|
||||||
|
|
||||||
|
|
||||||
@ -592,6 +701,7 @@ class HuggingFaceDataset(BenchmarkDataset):
|
|||||||
self,
|
self,
|
||||||
dataset_path: str,
|
dataset_path: str,
|
||||||
dataset_split: str,
|
dataset_split: str,
|
||||||
|
no_stream: bool = False,
|
||||||
dataset_subset: Optional[str] = None,
|
dataset_subset: Optional[str] = None,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
) -> None:
|
) -> None:
|
||||||
@ -599,6 +709,7 @@ class HuggingFaceDataset(BenchmarkDataset):
|
|||||||
|
|
||||||
self.dataset_split = dataset_split
|
self.dataset_split = dataset_split
|
||||||
self.dataset_subset = dataset_subset
|
self.dataset_subset = dataset_subset
|
||||||
|
self.load_stream = not no_stream
|
||||||
self.load_data()
|
self.load_data()
|
||||||
|
|
||||||
def load_data(self) -> None:
|
def load_data(self) -> None:
|
||||||
@ -607,7 +718,7 @@ class HuggingFaceDataset(BenchmarkDataset):
|
|||||||
self.dataset_path,
|
self.dataset_path,
|
||||||
name=self.dataset_subset,
|
name=self.dataset_subset,
|
||||||
split=self.dataset_split,
|
split=self.dataset_split,
|
||||||
streaming=True,
|
streaming=self.load_stream,
|
||||||
)
|
)
|
||||||
self.data = self.data.shuffle(seed=self.random_seed)
|
self.data = self.data.shuffle(seed=self.random_seed)
|
||||||
|
|
||||||
@ -619,20 +730,23 @@ class HuggingFaceDataset(BenchmarkDataset):
|
|||||||
|
|
||||||
class ConversationDataset(HuggingFaceDataset):
|
class ConversationDataset(HuggingFaceDataset):
|
||||||
"""Dataset for conversation data with multimodal support."""
|
"""Dataset for conversation data with multimodal support."""
|
||||||
|
|
||||||
SUPPORTED_DATASET_PATHS = {
|
SUPPORTED_DATASET_PATHS = {
|
||||||
'lmms-lab/LLaVA-OneVision-Data', 'Aeala/ShareGPT_Vicuna_unfiltered'
|
"lmms-lab/LLaVA-OneVision-Data",
|
||||||
|
"Aeala/ShareGPT_Vicuna_unfiltered",
|
||||||
}
|
}
|
||||||
IS_MULTIMODAL = True
|
IS_MULTIMODAL = True
|
||||||
|
|
||||||
def sample(self,
|
def sample(
|
||||||
tokenizer: PreTrainedTokenizerBase,
|
self,
|
||||||
num_requests: int,
|
tokenizer: PreTrainedTokenizerBase,
|
||||||
output_len: Optional[int] = None,
|
num_requests: int,
|
||||||
enable_multimodal_chat: bool = False,
|
output_len: Optional[int] = None,
|
||||||
**kwargs) -> list:
|
enable_multimodal_chat: bool = False,
|
||||||
|
**kwargs,
|
||||||
|
) -> list:
|
||||||
# Filter examples with at least 2 conversations
|
# Filter examples with at least 2 conversations
|
||||||
filtered_data = self.data.filter(
|
filtered_data = self.data.filter(lambda x: len(x["conversations"]) >= 2)
|
||||||
lambda x: len(x["conversations"]) >= 2)
|
|
||||||
sampled_requests = []
|
sampled_requests = []
|
||||||
dynamic_output = output_len is None
|
dynamic_output = output_len is None
|
||||||
|
|
||||||
@ -648,24 +762,22 @@ class ConversationDataset(HuggingFaceDataset):
|
|||||||
completion_len = len(completion_ids)
|
completion_len = len(completion_ids)
|
||||||
output_len = completion_len if dynamic_output else output_len
|
output_len = completion_len if dynamic_output else output_len
|
||||||
assert isinstance(output_len, int) and output_len > 0
|
assert isinstance(output_len, int) and output_len > 0
|
||||||
if dynamic_output and not is_valid_sequence(
|
if dynamic_output and not is_valid_sequence(prompt_len, completion_len):
|
||||||
prompt_len, completion_len):
|
|
||||||
continue
|
continue
|
||||||
mm_content = process_image(
|
mm_content = process_image(item["image"]) if "image" in item else None
|
||||||
item["image"]) if "image" in item else None
|
|
||||||
if enable_multimodal_chat:
|
if enable_multimodal_chat:
|
||||||
# Note: when chat is enabled the request prompt_len is no longer
|
# Note: when chat is enabled the request prompt_len is no longer
|
||||||
# accurate and we will be using request output to count the
|
# accurate and we will be using request output to count the
|
||||||
# actual prompt len and output len
|
# actual prompt len and output len
|
||||||
prompt = self.apply_multimodal_chat_transformation(
|
prompt = self.apply_multimodal_chat_transformation(prompt, mm_content)
|
||||||
prompt, mm_content)
|
|
||||||
sampled_requests.append(
|
sampled_requests.append(
|
||||||
SampleRequest(
|
SampleRequest(
|
||||||
prompt=prompt,
|
prompt=prompt,
|
||||||
prompt_len=prompt_len,
|
prompt_len=prompt_len,
|
||||||
expected_output_len=output_len,
|
expected_output_len=output_len,
|
||||||
multi_modal_data=mm_content,
|
multi_modal_data=mm_content,
|
||||||
))
|
)
|
||||||
|
)
|
||||||
self.maybe_oversample_requests(sampled_requests, num_requests)
|
self.maybe_oversample_requests(sampled_requests, num_requests)
|
||||||
return sampled_requests
|
return sampled_requests
|
||||||
|
|
||||||
@ -682,10 +794,8 @@ class VisionArenaDataset(HuggingFaceDataset):
|
|||||||
|
|
||||||
DEFAULT_OUTPUT_LEN = 128
|
DEFAULT_OUTPUT_LEN = 128
|
||||||
SUPPORTED_DATASET_PATHS = {
|
SUPPORTED_DATASET_PATHS = {
|
||||||
"lmarena-ai/VisionArena-Chat":
|
"lmarena-ai/VisionArena-Chat": lambda x: x["conversation"][0][0]["content"],
|
||||||
lambda x: x["conversation"][0][0]["content"],
|
"lmarena-ai/vision-arena-bench-v0.1": lambda x: x["turns"][0][0]["content"],
|
||||||
"lmarena-ai/vision-arena-bench-v0.1":
|
|
||||||
lambda x: x["turns"][0][0]["content"]
|
|
||||||
}
|
}
|
||||||
IS_MULTIMODAL = True
|
IS_MULTIMODAL = True
|
||||||
|
|
||||||
@ -697,16 +807,14 @@ class VisionArenaDataset(HuggingFaceDataset):
|
|||||||
enable_multimodal_chat: bool = False,
|
enable_multimodal_chat: bool = False,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
) -> list:
|
) -> list:
|
||||||
output_len = (output_len
|
output_len = output_len if output_len is not None else self.DEFAULT_OUTPUT_LEN
|
||||||
if output_len is not None else self.DEFAULT_OUTPUT_LEN)
|
|
||||||
sampled_requests = []
|
sampled_requests = []
|
||||||
for item in self.data:
|
for item in self.data:
|
||||||
if len(sampled_requests) >= num_requests:
|
if len(sampled_requests) >= num_requests:
|
||||||
break
|
break
|
||||||
parser_fn = self.SUPPORTED_DATASET_PATHS.get(self.dataset_path)
|
parser_fn = self.SUPPORTED_DATASET_PATHS.get(self.dataset_path)
|
||||||
if parser_fn is None:
|
if parser_fn is None:
|
||||||
raise ValueError(
|
raise ValueError(f"Unsupported dataset path: {self.dataset_path}")
|
||||||
f"Unsupported dataset path: {self.dataset_path}")
|
|
||||||
prompt = parser_fn(item)
|
prompt = parser_fn(item)
|
||||||
mm_content = process_image(item["images"][0])
|
mm_content = process_image(item["images"][0])
|
||||||
prompt_len = len(tokenizer(prompt).input_ids)
|
prompt_len = len(tokenizer(prompt).input_ids)
|
||||||
@ -714,15 +822,15 @@ class VisionArenaDataset(HuggingFaceDataset):
|
|||||||
# Note: when chat is enabled the request prompt_len is no longer
|
# Note: when chat is enabled the request prompt_len is no longer
|
||||||
# accurate and we will be using request output to count the
|
# accurate and we will be using request output to count the
|
||||||
# actual prompt len
|
# actual prompt len
|
||||||
prompt = self.apply_multimodal_chat_transformation(
|
prompt = self.apply_multimodal_chat_transformation(prompt, mm_content)
|
||||||
prompt, mm_content)
|
|
||||||
sampled_requests.append(
|
sampled_requests.append(
|
||||||
SampleRequest(
|
SampleRequest(
|
||||||
prompt=prompt,
|
prompt=prompt,
|
||||||
prompt_len=prompt_len,
|
prompt_len=prompt_len,
|
||||||
expected_output_len=output_len,
|
expected_output_len=output_len,
|
||||||
multi_modal_data=mm_content,
|
multi_modal_data=mm_content,
|
||||||
))
|
)
|
||||||
|
)
|
||||||
self.maybe_oversample_requests(sampled_requests, num_requests)
|
self.maybe_oversample_requests(sampled_requests, num_requests)
|
||||||
return sampled_requests
|
return sampled_requests
|
||||||
|
|
||||||
@ -747,26 +855,91 @@ class InstructCoderDataset(HuggingFaceDataset):
|
|||||||
"likaixin/InstructCoder",
|
"likaixin/InstructCoder",
|
||||||
}
|
}
|
||||||
|
|
||||||
def sample(self,
|
def sample(
|
||||||
tokenizer: PreTrainedTokenizerBase,
|
self,
|
||||||
num_requests: int,
|
tokenizer: PreTrainedTokenizerBase,
|
||||||
output_len: Optional[int] = None,
|
num_requests: int,
|
||||||
enable_multimodal_chat: bool = False,
|
output_len: Optional[int] = None,
|
||||||
**kwargs) -> list:
|
enable_multimodal_chat: bool = False,
|
||||||
output_len = (output_len
|
**kwargs,
|
||||||
if output_len is not None else self.DEFAULT_OUTPUT_LEN)
|
) -> list:
|
||||||
|
output_len = output_len if output_len is not None else self.DEFAULT_OUTPUT_LEN
|
||||||
sampled_requests = []
|
sampled_requests = []
|
||||||
for item in self.data:
|
for item in self.data:
|
||||||
if len(sampled_requests) >= num_requests:
|
if len(sampled_requests) >= num_requests:
|
||||||
break
|
break
|
||||||
prompt = f"{item['instruction']}:\n{item['input']}"
|
prompt = f"{item['input']}\n\n{item['instruction']} Just output \
|
||||||
|
the code, do not include any explanation."
|
||||||
|
|
||||||
|
# apply template
|
||||||
|
prompt = tokenizer.apply_chat_template(
|
||||||
|
[{"role": "user", "content": prompt}],
|
||||||
|
add_generation_prompt=True,
|
||||||
|
tokenize=False,
|
||||||
|
)
|
||||||
prompt_len = len(tokenizer(prompt).input_ids)
|
prompt_len = len(tokenizer(prompt).input_ids)
|
||||||
sampled_requests.append(
|
sampled_requests.append(
|
||||||
SampleRequest(
|
SampleRequest(
|
||||||
prompt=prompt,
|
prompt=prompt,
|
||||||
prompt_len=prompt_len,
|
prompt_len=prompt_len,
|
||||||
expected_output_len=output_len,
|
expected_output_len=output_len,
|
||||||
))
|
)
|
||||||
|
)
|
||||||
|
self.maybe_oversample_requests(sampled_requests, num_requests)
|
||||||
|
return sampled_requests
|
||||||
 
 
+# -----------------------------------------------------------------------------
+# MT-Bench Dataset Implementation
+# -----------------------------------------------------------------------------
+
+
+class MTBenchDataset(HuggingFaceDataset):
+    """
+    MT-Bench Dataset.
+    https://huggingface.co/datasets/philschmid/mt-bench
+
+    We create a single turn dataset for MT-Bench.
+    This is similar to Spec decoding benchmark setup in vLLM
+    https://github.com/vllm-project/vllm/blob/9d98ab5ec/examples/offline_inference/eagle.py#L14-L18
+    """  # noqa: E501
+
+    DEFAULT_OUTPUT_LEN = 256  # avg len used in SD bench in vLLM
+    SUPPORTED_DATASET_PATHS = {
+        "philschmid/mt-bench",
+    }
+
+    def sample(
+        self,
+        tokenizer: PreTrainedTokenizerBase,
+        num_requests: int,
+        output_len: Optional[int] = None,
+        enable_multimodal_chat: bool = False,
+        **kwargs,
+    ) -> list:
+        output_len = output_len if output_len is not None else self.DEFAULT_OUTPUT_LEN
+        sampled_requests = []
+
+        for item in self.data:
+            if len(sampled_requests) >= num_requests:
+                break
+            prompt = item["turns"][0]
+
+            # apply template
+            prompt = tokenizer.apply_chat_template(
+                [{"role": "user", "content": prompt}],
+                add_generation_prompt=True,
+                tokenize=False,
+            )
+
+            prompt_len = len(tokenizer(prompt).input_ids)
+            sampled_requests.append(
+                SampleRequest(
+                    prompt=prompt,
+                    prompt_len=prompt_len,
+                    expected_output_len=output_len,
+                )
+            )
         self.maybe_oversample_requests(sampled_requests, num_requests)
         return sampled_requests
 
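`MTBenchDataset` keeps only the first turn of each MT-Bench conversation and pushes it through the tokenizer's chat template. The sketch below illustrates that step on a toy record; the checkpoint name is just an example of a chat-capable tokenizer (it requires network access and is not mandated by the diff), and the record contents are invented.

```
# Hedged sketch of the single-turn MT-Bench sampling above, using a toy record.
# Assumption: "HuggingFaceH4/zephyr-7b-beta" is reachable and ships a chat template.
from transformers import AutoTokenizer

record = {
    "turns": [
        "Compose an engaging travel blog post about Hawaii.",
        "Rewrite it as a poem.",  # later turns are ignored by the dataset class
    ]
}
tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")

prompt = tokenizer.apply_chat_template(
    [{"role": "user", "content": record["turns"][0]}],  # only the first turn is kept
    add_generation_prompt=True,
    tokenize=False,
)
print(prompt)
print("prompt_len:", len(tokenizer(prompt).input_ids))
```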
@ -780,23 +953,27 @@ class AIMODataset(HuggingFaceDataset):
|
|||||||
"""
|
"""
|
||||||
Dataset class for processing a AIMO dataset with reasoning questions.
|
Dataset class for processing a AIMO dataset with reasoning questions.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
SUPPORTED_DATASET_PATHS = {
|
SUPPORTED_DATASET_PATHS = {
|
||||||
"AI-MO/aimo-validation-aime", "AI-MO/NuminaMath-1.5",
|
"AI-MO/aimo-validation-aime",
|
||||||
"AI-MO/NuminaMath-CoT"
|
"AI-MO/NuminaMath-1.5",
|
||||||
|
"AI-MO/NuminaMath-CoT",
|
||||||
}
|
}
|
||||||
|
|
||||||
def sample(self,
|
def sample(
|
||||||
tokenizer: PreTrainedTokenizerBase,
|
self,
|
||||||
num_requests: int,
|
tokenizer: PreTrainedTokenizerBase,
|
||||||
output_len: Optional[int] = None,
|
num_requests: int,
|
||||||
**kwargs) -> list:
|
output_len: Optional[int] = None,
|
||||||
|
**kwargs,
|
||||||
|
) -> list:
|
||||||
sampled_requests = []
|
sampled_requests = []
|
||||||
dynamic_output = output_len is None
|
dynamic_output = output_len is None
|
||||||
|
|
||||||
for item in self.data:
|
for item in self.data:
|
||||||
if len(sampled_requests) >= num_requests:
|
if len(sampled_requests) >= num_requests:
|
||||||
break
|
break
|
||||||
prompt, completion = item['problem'], item["solution"]
|
prompt, completion = item["problem"], item["solution"]
|
||||||
|
|
||||||
prompt_ids = tokenizer(prompt).input_ids
|
prompt_ids = tokenizer(prompt).input_ids
|
||||||
completion_ids = tokenizer(completion).input_ids
|
completion_ids = tokenizer(completion).input_ids
|
||||||
@ -804,10 +981,9 @@ class AIMODataset(HuggingFaceDataset):
|
|||||||
completion_len = len(completion_ids)
|
completion_len = len(completion_ids)
|
||||||
output_len = completion_len if dynamic_output else output_len
|
output_len = completion_len if dynamic_output else output_len
|
||||||
assert isinstance(output_len, int) and output_len > 0
|
assert isinstance(output_len, int) and output_len > 0
|
||||||
if dynamic_output and not is_valid_sequence(prompt_len,
|
if dynamic_output and not is_valid_sequence(
|
||||||
completion_len,
|
prompt_len, completion_len, max_prompt_len=2048, max_total_len=32000
|
||||||
max_prompt_len=2048,
|
):
|
||||||
max_total_len=32000):
|
|
||||||
continue
|
continue
|
||||||
sampled_requests.append(
|
sampled_requests.append(
|
||||||
SampleRequest(
|
SampleRequest(
|
||||||
@ -815,11 +991,100 @@ class AIMODataset(HuggingFaceDataset):
|
|||||||
prompt_len=prompt_len,
|
prompt_len=prompt_len,
|
||||||
expected_output_len=output_len,
|
expected_output_len=output_len,
|
||||||
multi_modal_data=None,
|
multi_modal_data=None,
|
||||||
))
|
)
|
||||||
|
)
|
||||||
self.maybe_oversample_requests(sampled_requests, num_requests)
|
self.maybe_oversample_requests(sampled_requests, num_requests)
|
||||||
return sampled_requests
|
return sampled_requests
|
||||||
|
|
||||||
 
 
+# -----------------------------------------------------------------------------
+# Next Edit Prediction Dataset Implementation
+# -----------------------------------------------------------------------------
+
+zeta_prompt = """### Instruction:
+You are a code completion assistant and your task is to analyze user edits and then rewrite an excerpt that the user provides, suggesting the appropriate edits within the excerpt, taking into account the cursor location.
+
+### User Edits:
+
+{}
+
+### User Excerpt:
+
+{}
+
+### Response:
+
+"""  # noqa: E501
+
+
+def _format_zeta_prompt(
+    sample: dict, original_start_marker: str = "<|editable_region_start|>"
+) -> dict:
+    """Format the zeta prompt for the Next Edit Prediction (NEP) dataset.
+
+    This function formats examples from the NEP dataset
+    into prompts and expected outputs. It could be
+    further extended to support more NEP datasets.
+
+    Args:
+        sample: The dataset sample containing events,
+            inputs, and outputs.
+        original_start_marker: The marker indicating the
+            start of the editable region. Defaults to
+            "<|editable_region_start|>".
+
+    Returns:
+        A dictionary with the formatted prompts and expected outputs.
+    """
+    events = sample["events"]
+    input = sample["input"]
+    output = sample["output"]
+    prompt = zeta_prompt.format(events, input)
+
+    # following the original implementation, extract the focused region
+    # from the raw output
+    output_start_index = output.find(original_start_marker)
+    output_focused_region = output[output_start_index:]
+    expected_output = output_focused_region
+
+    return {"prompt": prompt, "expected_output": expected_output}
+
+
+class NextEditPredictionDataset(HuggingFaceDataset):
+    """
+    Dataset class for processing a Next Edit Prediction dataset.
+    """
+
+    SUPPORTED_DATASET_PATHS = {
+        "zed-industries/zeta",
+    }
+    MAPPING_PROMPT_FUNCS = {
+        "zed-industries/zeta": _format_zeta_prompt,
+    }
+
+    def sample(self, tokenizer: PreTrainedTokenizerBase, num_requests: int, **kwargs):
+        formatting_prompt_func = self.MAPPING_PROMPT_FUNCS.get(self.dataset_path)
+        if formatting_prompt_func is None:
+            raise ValueError(f"Unsupported dataset path: {self.dataset_path}")
+        samples = []
+        for sample in self.data:
+            sample = formatting_prompt_func(sample)
+            samples.append(
+                SampleRequest(
+                    prompt=sample["prompt"],
+                    prompt_len=len(tokenizer(sample["prompt"]).input_ids),
+                    expected_output_len=len(
+                        tokenizer(sample["expected_output"]).input_ids
+                    ),
+                )
+            )
+            if len(samples) >= num_requests:
+                break
+        self.maybe_oversample_requests(samples, num_requests)
+        return samples
 
 
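The key move in `_format_zeta_prompt` is slicing the raw output at the editable-region marker so the expected completion starts exactly at the region the model is asked to rewrite. A toy illustration of that trimming is shown below; the sample fields are made up and only mirror the structure of the zeta dataset.

```
# Toy illustration of the marker-based output trimming in _format_zeta_prompt above.
# The sample values are invented; only the structure mirrors the dataset.
START = "<|editable_region_start|>"

sample = {
    "events": "User renamed `foo` to `bar`.",
    "input": f"def bar():\n    {START}\n    return 1\n",
    "output": f"preamble that is dropped\n{START}\n    return 2\n",
}

prompt = "### User Edits:\n\n{}\n\n### User Excerpt:\n\n{}\n\n### Response:\n".format(
    sample["events"], sample["input"]
)
# Keep only the focused region, from the start marker to the end of the output.
expected_output = sample["output"][sample["output"].find(START):]

print(prompt)
print(expected_output)
```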
 # -----------------------------------------------------------------------------
 # ASR Dataset Implementation
 # -----------------------------------------------------------------------------
@@ -842,18 +1107,22 @@ class ASRDataset(HuggingFaceDataset):
     | AMI | Meetings | Spontaneous | ihm, sdm |
     +----------------+----------------------------------------+--------------------------+-----------------------------+

     """  # noqa: E501

     SUPPORTED_DATASET_PATHS = {
-        "openslr/librispeech_asr", "facebook/voxpopuli", "LIUM/tedlium",
-        "edinburghcstr/ami", "speechcolab/gigaspeech", "kensho/spgispeech"
+        "openslr/librispeech_asr",
+        "facebook/voxpopuli",
+        "LIUM/tedlium",
+        "edinburghcstr/ami",
+        "speechcolab/gigaspeech",
+        "kensho/spgispeech",
     }

     DEFAULT_OUTPUT_LEN = 128
     IS_MULTIMODAL = True

     # TODO Whisper-specific. Abstract interface when more models are supported.
-    TRANSCRIPTION_PREAMBLE = "<|startoftranscript|><|en|><|transcribe|>"\
-                             "<|notimestamps|>"
+    TRANSCRIPTION_PREAMBLE = "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>"
     skip_long_audios: bool = True

     def sample(
@@ -864,8 +1133,8 @@ class ASRDataset(HuggingFaceDataset):
         **kwargs,
     ) -> list:
         import librosa
-        output_len = (output_len
-                      if output_len is not None else self.DEFAULT_OUTPUT_LEN)
+        output_len = output_len if output_len is not None else self.DEFAULT_OUTPUT_LEN
         prompt = ASRDataset.TRANSCRIPTION_PREAMBLE
         prompt_len = len(tokenizer(prompt).input_ids)
         sampled_requests = []
@@ -888,10 +1157,14 @@ class ASRDataset(HuggingFaceDataset):
                     prompt_len=prompt_len,
                     expected_output_len=output_len,
                     multi_modal_data=mm_content,
-                ))
+                )
+            )
         if skipped:
-            logger.warning("%d samples discarded from dataset due to" \
-                           " their length being greater than" \
-                           " what Whisper supports.", skipped)
+            logger.warning(
+                "%d samples discarded from dataset due to"
+                " their length being greater than"
+                " what Whisper supports.",
+                skipped,
+            )
         self.maybe_oversample_requests(sampled_requests, num_requests)
         return sampled_requests
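As a rough illustration of the Whisper-style prompt handling above, a hedged sketch of preparing one audio sample the way ASRDataset.sample() does; the file name, sampling rate, and the exact multi-modal payload shape are assumptions:

    import librosa

    audio, sr = librosa.load("sample.wav", sr=16000)  # placeholder audio file
    prompt = "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>"
    mm_content = {"audio": (audio, sr)}  # assumed multi_modal_data layout
    output_len = 128  # mirrors DEFAULT_OUTPUT_LEN when no override is given
    print(len(prompt), len(audio) / sr, output_len)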
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Benchmark the latency of processing a single batch of requests."""

 import argparse
@@ -6,14 +7,13 @@ import dataclasses
 import json
 import os
 import time
-from pathlib import Path
 from typing import Any, Optional

 import numpy as np
-import torch
-from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
 from tqdm import tqdm

+import vllm.envs as envs
+from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
 from vllm import LLM, SamplingParams
 from vllm.engine.arg_utils import EngineArgs
 from vllm.inputs import PromptType
@@ -21,13 +21,14 @@ from vllm.sampling_params import BeamSearchParams
 from vllm.utils import FlexibleArgumentParser


-def save_to_pytorch_benchmark_format(args: argparse.Namespace,
-                                     results: dict[str, Any]) -> None:
+def save_to_pytorch_benchmark_format(
+    args: argparse.Namespace, results: dict[str, Any]
+) -> None:
     pt_records = convert_to_pytorch_benchmark_format(
         args=args,
         metrics={"latency": results["latencies"]},
-        extra_info={k: results[k]
-                    for k in ["avg_latency", "percentiles"]})
+        extra_info={k: results[k] for k in ["avg_latency", "percentiles"]},
+    )
     if pt_records:
         pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch.json"
         write_to_json(pt_file, pt_records)
@@ -42,9 +43,11 @@ def main(args: argparse.Namespace):
     # the engine will automatically process the request in multiple batches.
     llm = LLM(**dataclasses.asdict(engine_args))
     assert llm.llm_engine.model_config.max_model_len >= (
-        args.input_len +
-        args.output_len), ("Please ensure that max_model_len is greater than"
-                           " the sum of input_len and output_len.")
+        args.input_len + args.output_len
+    ), (
+        "Please ensure that max_model_len is greater than"
+        " the sum of input_len and output_len."
+    )

     sampling_params = SamplingParams(
         n=args.n,
@@ -55,18 +58,16 @@ def main(args: argparse.Namespace):
         detokenize=not args.disable_detokenize,
     )
     print(sampling_params)
-    dummy_prompt_token_ids = np.random.randint(10000,
-                                               size=(args.batch_size,
-                                                     args.input_len))
-    dummy_prompts: list[PromptType] = [{
-        "prompt_token_ids": batch
-    } for batch in dummy_prompt_token_ids.tolist()]
+    dummy_prompt_token_ids = np.random.randint(
+        10000, size=(args.batch_size, args.input_len)
+    )
+    dummy_prompts: list[PromptType] = [
+        {"prompt_token_ids": batch} for batch in dummy_prompt_token_ids.tolist()
+    ]

     def llm_generate():
         if not args.use_beam_search:
-            llm.generate(dummy_prompts,
-                         sampling_params=sampling_params,
-                         use_tqdm=False)
+            llm.generate(dummy_prompts, sampling_params=sampling_params, use_tqdm=False)
         else:
             llm.beam_search(
                 dummy_prompts,
@@ -79,16 +80,9 @@ def main(args: argparse.Namespace):

     def run_to_completion(profile_dir: Optional[str] = None):
         if profile_dir:
-            with torch.profiler.profile(
-                    activities=[
-                        torch.profiler.ProfilerActivity.CPU,
-                        torch.profiler.ProfilerActivity.CUDA,
-                    ],
-                    on_trace_ready=torch.profiler.tensorboard_trace_handler(
-                        str(profile_dir)),
-            ) as p:
-                llm_generate()
-            print(p.key_averages().table(sort_by="self_cuda_time_total"))
+            llm.start_profile()
+            llm_generate()
+            llm.stop_profile()
         else:
             start_time = time.perf_counter()
             llm_generate()
@@ -101,10 +95,7 @@ def main(args: argparse.Namespace):
         run_to_completion(profile_dir=None)

     if args.profile:
-        profile_dir = args.profile_result_dir
-        if not profile_dir:
-            profile_dir = (Path(".") / "vllm_benchmark_result" /
-                           f"latency_result_{time.time()}")
+        profile_dir = envs.VLLM_TORCH_PROFILER_DIR
         print(f"Profiling (results will be saved to '{profile_dir}')...")
         run_to_completion(profile_dir=profile_dir)
         return
@@ -132,10 +123,11 @@ def main(args: argparse.Namespace):
         save_to_pytorch_benchmark_format(args, results)


-if __name__ == "__main__":
+def create_argument_parser():
     parser = FlexibleArgumentParser(
         description="Benchmark the latency of processing a single batch of "
-        "requests till completion.")
+        "requests till completion."
+    )
     parser.add_argument("--input-len", type=int, default=32)
     parser.add_argument("--output-len", type=int, default=128)
     parser.add_argument("--batch-size", type=int, default=8)
@@ -152,22 +144,14 @@ if __name__ == "__main__":
         default=10,
         help="Number of iterations to run for warmup.",
     )
-    parser.add_argument("--num-iters",
-                        type=int,
-                        default=30,
-                        help="Number of iterations to run.")
+    parser.add_argument(
+        "--num-iters", type=int, default=30, help="Number of iterations to run."
+    )
     parser.add_argument(
         "--profile",
         action="store_true",
         help="profile the generation process of a single batch",
     )
-    parser.add_argument(
-        "--profile-result-dir",
-        type=str,
-        default=None,
-        help=("path to save the pytorch profiler output. Can be visualized "
-              "with ui.perfetto.dev or Tensorboard."),
-    )
     parser.add_argument(
         "--output-json",
         type=str,
@@ -177,10 +161,26 @@ if __name__ == "__main__":
     parser.add_argument(
         "--disable-detokenize",
         action="store_true",
-        help=("Do not detokenize responses (i.e. do not include "
-              "detokenization time in the latency measurement)"),
+        help=(
+            "Do not detokenize responses (i.e. do not include "
+            "detokenization time in the latency measurement)"
+        ),
     )

     parser = EngineArgs.add_cli_args(parser)
+    # V1 enables prefix caching by default which skews the latency
+    # numbers. We need to disable prefix caching by default.
+    parser.set_defaults(enable_prefix_caching=False)
+
+    return parser
+
+
+if __name__ == "__main__":
+    parser = create_argument_parser()
     args = parser.parse_args()
+    if args.profile and not envs.VLLM_TORCH_PROFILER_DIR:
+        raise OSError(
+            "The environment variable 'VLLM_TORCH_PROFILER_DIR' is not set. "
+            "Please set it to a valid path to use torch profiler."
+        )
     main(args)
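A usage sketch for the reworked profiling path above: the inline torch.profiler block is gone, --profile now requires VLLM_TORCH_PROFILER_DIR to be set, and the engine writes traces itself via start_profile()/stop_profile(). The model name below is an arbitrary example:

    import os
    import subprocess

    os.environ["VLLM_TORCH_PROFILER_DIR"] = "/tmp/vllm_profile"  # required for --profile
    subprocess.run(
        [
            "python", "benchmark_latency.py",
            "--model", "facebook/opt-125m",
            "--input-len", "32",
            "--output-len", "128",
            "--batch-size", "8",
            "--profile",
        ],
        check=True,
    )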
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 Offline benchmark to test the long document QA throughput.

@@ -76,7 +77,7 @@ def repeat_prompts(prompts, repeat_count, mode: str):
         - 'random': Shuffle the prompts randomly after repetition.
         - 'tile': Repeat the entire prompt list in sequence.
           Example: [1, 2, 3] -> [1, 2, 3, 1, 2, 3].
         - 'interleave': Repeat each prompt consecutively before moving to
           the next. Example: [1, 2, 3] -> [1, 1, 2, 2, 3, 3].

     Returns:
@@ -86,20 +87,21 @@ def repeat_prompts(prompts, repeat_count, mode: str):
         ValueError: If an invalid mode is provided.
     """
     print("Repeat mode: ", mode)
-    if mode == 'random':
+    if mode == "random":
         repeated_prompts = prompts * repeat_count
         random.shuffle(repeated_prompts)
         return repeated_prompts
-    elif mode == 'tile':
+    elif mode == "tile":
         return prompts * repeat_count
-    elif mode == 'interleave':
+    elif mode == "interleave":
         repeated_prompts = []
         for prompt in prompts:
             repeated_prompts.extend([prompt] * repeat_count)
         return repeated_prompts
     else:
-        raise ValueError(f"Invalid mode: {mode}, only support "
-                         "'random', 'tile', 'interleave'")
+        raise ValueError(
+            f"Invalid mode: {mode}, only support 'random', 'tile', 'interleave'"
+        )


 def main(args):
@@ -109,16 +111,16 @@ def main(args):
     # we append the document id at the beginning to avoid any of the document
     # being the prefix of other documents
     prompts = [
-        str(i) + ' '.join(['hi'] * args.document_length)
+        str(i) + " ".join(["hi"] * args.document_length)
         for i in range(args.num_documents)
     ]

     prompts = repeat_prompts(prompts, args.repeat_count, mode=args.repeat_mode)

     warmup_prompts = [
-        "This is warm up request " + str(i) + \
-            ' '.join(['hi'] * args.document_length)
-        for i in range(args.num_documents)]
+        "This is warm up request " + str(i) + " ".join(["hi"] * args.document_length)
+        for i in range(args.num_documents)
+    ]

     # Create the LLM engine
     engine_args = EngineArgs.from_cli_args(args)
@@ -140,45 +142,61 @@ def main(args):
     )


-if __name__ == "__main__":
+def create_argument_parser():
     parser = FlexibleArgumentParser(
-        description=
-        'Benchmark the performance with or without automatic prefix caching.')
+        description="Benchmark the performance with or "
+        "without automatic prefix caching."
+    )
+
     parser.add_argument(
-        '--document-length',
+        "--document-length",
         type=int,
         # Roughly the number of tokens for a system paper,
         # excluding images
         default=20000,
-        help='Range of input lengths for sampling prompts,'
-        'specified as "min:max" (e.g., "128:256").')
-    parser.add_argument('--num-documents',
-                        type=int,
-                        default=8,
-                        help='Range of input lengths for sampling prompts,'
-                        'specified as "min:max" (e.g., "128:256").')
-    parser.add_argument('--output-len', type=int, default=10)
-    parser.add_argument('--repeat-count',
-                        type=int,
-                        default=2,
-                        help='Number of times to repeat each prompt')
-    parser.add_argument("--repeat-mode",
-                        type=str,
-                        default='random',
-                        help='The mode to repeat prompts. The supported '
-                        'modes are "random", "tile", and "interleave". '
-                        'See repeat_prompts() in the source code for details.')
-    parser.add_argument("--shuffle-seed",
-                        type=int,
-                        default=0,
-                        help='Random seed when the repeat mode is "random"')
+        help="Range of input lengths for sampling prompts, "
+        'specified as "min:max" (e.g., "128:256").',
+    )
+
+    parser.add_argument(
+        "--num-documents",
+        type=int,
+        default=8,
+        help="Range of input lengths for sampling prompts, "
+        'specified as "min:max" (e.g., "128:256").',
+    )
+
+    parser.add_argument("--output-len", type=int, default=10)
+
+    parser.add_argument(
+        "--repeat-count",
+        type=int,
+        default=2,
+        help="Number of times to repeat each prompt",
+    )
+
+    parser.add_argument(
+        "--repeat-mode",
+        type=str,
+        default="random",
+        help="The mode to repeat prompts. The supported "
+        'modes are "random", "tile", and "interleave". '
+        "See repeat_prompts() in the source code for details.",
+    )
+
+    parser.add_argument(
+        "--shuffle-seed",
+        type=int,
+        default=0,
+        help='Random seed when the repeat mode is "random"',
+    )

     parser = EngineArgs.add_cli_args(parser)
+
+    return parser
+
+
+if __name__ == "__main__":
+    parser = create_argument_parser()
     args = parser.parse_args()
     main(args)
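To make the three repeat modes above concrete, a small self-contained sketch; the helper restates the logic from the diff locally rather than importing the benchmark:

    import random

    def repeat_prompts(prompts, repeat_count, mode):
        # "random": repeat then shuffle; "tile": whole list repeated in order;
        # "interleave": each prompt repeated back to back.
        if mode == "random":
            out = prompts * repeat_count
            random.shuffle(out)
            return out
        if mode == "tile":
            return prompts * repeat_count
        if mode == "interleave":
            return [p for p in prompts for _ in range(repeat_count)]
        raise ValueError(f"Invalid mode: {mode}")

    print(repeat_prompts([1, 2, 3], 2, "tile"))        # [1, 2, 3, 1, 2, 3]
    print(repeat_prompts([1, 2, 3], 2, "interleave"))  # [1, 1, 2, 2, 3, 3]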
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 Benchmark the efficiency of prefix caching.

@@ -63,14 +64,15 @@ class Request:
     output_len: int


-def sample_tokens(tokenizer: PreTrainedTokenizerBase, length: int) -> str:
+def sample_tokens(tokenizer: PreTrainedTokenizerBase, length: int) -> list[int]:
     vocab = tokenizer.get_vocab()
+    all_special_ids = set(tokenizer.all_special_ids)
+
     # Remove the special tokens.
-    vocab = {
-        k: v
-        for k, v in vocab.items() if k not in tokenizer.all_special_ids
-    }
-    return random.choices(list(vocab.values()), k=length)
+    return random.choices(
+        [v for k, v in vocab.items() if k not in all_special_ids],
+        k=length,
+    )


 def sample_requests_from_dataset(
@@ -89,8 +91,10 @@ def sample_requests_from_dataset(
     # Filter out the conversations with less than 2 turns.
     dataset = [data for data in dataset if len(data["conversations"]) >= 2]
     # Only keep the first two turns of each conversation.
-    dataset = [(data["conversations"][0]["value"],
-                data["conversations"][1]["value"]) for data in dataset]
+    dataset = [
+        (data["conversations"][0]["value"], data["conversations"][1]["value"])
+        for data in dataset
+    ]

     # Shuffle the dataset.
     random.shuffle(dataset)
@@ -111,8 +115,9 @@ def sample_requests_from_dataset(
         completion = dataset[i][1]
         completion_token_ids = tokenizer(completion).input_ids
         prompt_len = len(prompt_token_ids)
-        output_len = (len(completion_token_ids)
-                      if fixed_output_len is None else fixed_output_len)
+        output_len = (
+            len(completion_token_ids) if fixed_output_len is None else fixed_output_len
+        )
         if min_len <= prompt_len <= max_len:
             filtered_requests.append(Request(prompt, prompt_len, output_len))

@@ -126,27 +131,27 @@ def sample_requests_from_random(
     fixed_output_len: Optional[int],
     prefix_len: int,
 ) -> list[Request]:

     requests = []
     prefix_token_ids = sample_tokens(tokenizer, prefix_len)
     min_len, max_len = input_length_range

     for i in range(num_requests):
         unique_part_token_ids = sample_tokens(
-            tokenizer,
-            random.randint(min_len - prefix_len, max_len - prefix_len))
+            tokenizer, random.randint(min_len - prefix_len, max_len - prefix_len)
+        )
         prompt_token_ids = prefix_token_ids + unique_part_token_ids
         prompt = tokenizer.decode(prompt_token_ids)
         prompt_len = len(prompt_token_ids)
-        assert (min_len <= prompt_len <= max_len
-                ), f"prompt_len {prompt_len} out of range {min_len}:{max_len}"
+        assert min_len <= prompt_len <= max_len, (
+            f"prompt_len {prompt_len} out of range {min_len}:{max_len}"
+        )
         requests.append(Request(prompt, prompt_len, fixed_output_len))
     return requests


-def repeat_and_sort_requests(requests: list[Request],
-                             repeat_count: int,
-                             sort: bool = False) -> list[str]:
+def repeat_and_sort_requests(
+    requests: list[Request], repeat_count: int, sort: bool = False
+) -> list[str]:
     repeated_requests = requests * repeat_count
     if sort:
         repeated_requests.sort(key=lambda x: x[1])
@@ -157,14 +162,14 @@ def repeat_and_sort_requests(requests: list[Request],

 def main(args):
     tokenizer = get_tokenizer(args.model, trust_remote_code=True)
-    input_length_range = tuple(map(int, args.input_length_range.split(':')))
+    input_length_range = tuple(map(int, args.input_length_range.split(":")))
     random.seed(args.seed)
     if args.dataset_path is not None:
         if args.prefix_len > 0:
-            raise ValueError("prefix-len is not supported when "
-                             "dataset-path is provided.")
-        print(f"Start to sample {args.num_prompts} prompts "
-              f"from {args.dataset_path}")
+            raise ValueError(
+                "prefix-len is not supported when dataset-path is provided."
+            )
+        print(f"Start to sample {args.num_prompts} prompts from {args.dataset_path}")
         filtered_requests = sample_requests_from_dataset(
             dataset_path=args.dataset_path,
             num_requests=args.num_prompts,
@@ -194,14 +199,16 @@ def main(args):

     llm = LLM(**dataclasses.asdict(engine_args))

-    sampling_params = SamplingParams(temperature=0,
-                                     max_tokens=args.output_len,
-                                     detokenize=not args.disable_detokenize)
+    sampling_params = SamplingParams(
+        temperature=0,
+        max_tokens=args.output_len,
+        detokenize=not args.disable_detokenize,
+    )

     print("Testing filtered requests")
-    prompts = repeat_and_sort_requests(filtered_requests,
-                                       repeat_count=args.repeat_count,
-                                       sort=args.sort)
+    prompts = repeat_and_sort_requests(
+        filtered_requests, repeat_count=args.repeat_count, sort=args.sort
+    )

     print("------start generating------")
     test_prefix(
@@ -211,31 +218,37 @@ def main(args):
     )


-if __name__ == "__main__":
+def create_argument_parser():
     parser = FlexibleArgumentParser(
-        description=
-        'Benchmark the performance with or without automatic prefix caching.')
-    parser.add_argument("--dataset-path",
-                        type=str,
-                        default=None,
-                        help="Path to the dataset.")
-    parser.add_argument('--output-len', type=int, default=10)
-    parser.add_argument('--num-prompts',
-                        type=int,
-                        required=True,
-                        help="Number of the prompts sampled from dataset")
-    parser.add_argument('--repeat-count',
-                        type=int,
-                        default=1,
-                        help='Number of times to repeat each prompt')
-    parser.add_argument('--sort',
-                        action='store_true',
-                        help='Sort prompts by input length')
-    parser.add_argument('--input-length-range',
-                        type=str,
-                        required=True,
-                        help='Range of input lengths for sampling prompts,'
-                        'specified as "min:max" (e.g., "128:256").')
+        description="Benchmark the performance with or without "
+        "automatic prefix caching."
+    )
+    parser.add_argument(
+        "--dataset-path", type=str, default=None, help="Path to the dataset."
+    )
+    parser.add_argument("--output-len", type=int, default=10)
+    parser.add_argument(
+        "--num-prompts",
+        type=int,
+        required=True,
+        help="Number of the prompts sampled from dataset",
+    )
+    parser.add_argument(
+        "--repeat-count",
+        type=int,
+        default=1,
+        help="Number of times to repeat each prompt",
+    )
+    parser.add_argument(
+        "--sort", action="store_true", help="Sort prompts by input length"
+    )
+    parser.add_argument(
+        "--input-length-range",
+        type=str,
+        required=True,
+        help="Range of input lengths for sampling prompts,"
+        'specified as "min:max" (e.g., "128:256").',
+    )
     parser.add_argument(
         "--prefix-len",
         type=int,
@@ -246,12 +259,20 @@ if __name__ == "__main__":
         "when dataset-path is not provided.",
     )
     parser.add_argument(
-        '--disable-detokenize',
-        action='store_true',
-        help=("Do not detokenize responses (i.e. do not include "
-              "detokenization time in the latency measurement)"),
+        "--disable-detokenize",
+        action="store_true",
+        help=(
+            "Do not detokenize responses (i.e. do not include "
+            "detokenization time in the latency measurement)"
+        ),
     )

     parser = EngineArgs.add_cli_args(parser)
+
+    return parser
+
+
+if __name__ == "__main__":
+    parser = create_argument_parser()
     args = parser.parse_args()
     main(args)
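A toy illustration of how the corrected sample_tokens() above is used to build prompts that share a common prefix; the tokenizer name is an example and the helper is restated locally instead of being imported:

    import random
    from transformers import AutoTokenizer

    def sample_tokens(tokenizer, length):
        vocab = tokenizer.get_vocab()
        special = set(tokenizer.all_special_ids)
        # pick random token ids, never special tokens
        return random.choices([v for v in vocab.values() if v not in special], k=length)

    tok = AutoTokenizer.from_pretrained("gpt2")
    prefix_ids = sample_tokens(tok, 8)               # shared across requests -> prefix-cache hits
    prompt_ids = prefix_ids + sample_tokens(tok, 4)  # unique tail per request
    print(tok.decode(prompt_ids))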
@ -1,5 +1,7 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
"""Benchmark offline prioritization."""
|
"""Benchmark offline prioritization."""
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
import dataclasses
|
import dataclasses
|
||||||
import json
|
import json
|
||||||
@ -13,7 +15,7 @@ from vllm.engine.arg_utils import EngineArgs
|
|||||||
from vllm.utils import FlexibleArgumentParser
|
from vllm.utils import FlexibleArgumentParser
|
||||||
|
|
||||||
|
|
||||||
#Select a equi-probable random priority
|
# Select a equi-probable random priority
|
||||||
def get_random_flag():
|
def get_random_flag():
|
||||||
return 0 if random.random() < 0.5 else 1
|
return 0 if random.random() < 0.5 else 1
|
||||||
|
|
||||||
@ -33,8 +35,10 @@ def sample_requests(
|
|||||||
# Filter out the conversations with less than 2 turns.
|
# Filter out the conversations with less than 2 turns.
|
||||||
dataset = [data for data in dataset if len(data["conversations"]) >= 2]
|
dataset = [data for data in dataset if len(data["conversations"]) >= 2]
|
||||||
# Only keep the first two turns of each conversation.
|
# Only keep the first two turns of each conversation.
|
||||||
dataset = [(data["conversations"][0]["value"],
|
dataset = [
|
||||||
data["conversations"][1]["value"]) for data in dataset]
|
(data["conversations"][0]["value"], data["conversations"][1]["value"])
|
||||||
|
for data in dataset
|
||||||
|
]
|
||||||
|
|
||||||
# Shuffle the dataset.
|
# Shuffle the dataset.
|
||||||
random.shuffle(dataset)
|
random.shuffle(dataset)
|
||||||
@ -51,8 +55,9 @@ def sample_requests(
|
|||||||
completion = dataset[i][1]
|
completion = dataset[i][1]
|
||||||
completion_token_ids = tokenizer(completion).input_ids
|
completion_token_ids = tokenizer(completion).input_ids
|
||||||
prompt_len = len(prompt_token_ids)
|
prompt_len = len(prompt_token_ids)
|
||||||
output_len = len(completion_token_ids
|
output_len = (
|
||||||
) if fixed_output_len is None else fixed_output_len
|
len(completion_token_ids) if fixed_output_len is None else fixed_output_len
|
||||||
|
)
|
||||||
if prompt_len < 4 or output_len < 4:
|
if prompt_len < 4 or output_len < 4:
|
||||||
# Prune too short sequences.
|
# Prune too short sequences.
|
||||||
continue
|
continue
|
||||||
@ -74,13 +79,16 @@ def run_vllm(
|
|||||||
disable_detokenize: bool = False,
|
disable_detokenize: bool = False,
|
||||||
) -> float:
|
) -> float:
|
||||||
from vllm import LLM, SamplingParams
|
from vllm import LLM, SamplingParams
|
||||||
|
|
||||||
llm = LLM(**dataclasses.asdict(engine_args))
|
llm = LLM(**dataclasses.asdict(engine_args))
|
||||||
|
|
||||||
assert all(
|
assert all(
|
||||||
llm.llm_engine.model_config.max_model_len >= (request[1] + request[2])
|
llm.llm_engine.model_config.max_model_len >= (request[1] + request[2])
|
||||||
for request in requests), (
|
for request in requests
|
||||||
"Please ensure that max_model_len is greater than the sum of"
|
), (
|
||||||
" input_len and output_len for all requests.")
|
"Please ensure that max_model_len is greater than the sum of"
|
||||||
|
" input_len and output_len for all requests."
|
||||||
|
)
|
||||||
|
|
||||||
# Add the requests to the engine.
|
# Add the requests to the engine.
|
||||||
prompts = []
|
prompts = []
|
||||||
@ -97,7 +105,8 @@ def run_vllm(
|
|||||||
ignore_eos=True,
|
ignore_eos=True,
|
||||||
max_tokens=output_len,
|
max_tokens=output_len,
|
||||||
detokenize=not disable_detokenize,
|
detokenize=not disable_detokenize,
|
||||||
))
|
)
|
||||||
|
)
|
||||||
|
|
||||||
start = time.perf_counter()
|
start = time.perf_counter()
|
||||||
llm.generate(prompts, sampling_params, priority=priority, use_tqdm=True)
|
llm.generate(prompts, sampling_params, priority=priority, use_tqdm=True)
|
||||||
@ -111,26 +120,33 @@ def main(args: argparse.Namespace):
|
|||||||
|
|
||||||
# Sample the requests.
|
# Sample the requests.
|
||||||
tokenizer = AutoTokenizer.from_pretrained(
|
tokenizer = AutoTokenizer.from_pretrained(
|
||||||
args.tokenizer, trust_remote_code=args.trust_remote_code)
|
args.tokenizer, trust_remote_code=args.trust_remote_code
|
||||||
|
)
|
||||||
if args.dataset is None:
|
if args.dataset is None:
|
||||||
# Synthesize a prompt with the given input length.
|
# Synthesize a prompt with the given input length.
|
||||||
prompt = "hi" * (args.input_len - 1)
|
prompt = "hi" * (args.input_len - 1)
|
||||||
requests = [(prompt, args.input_len, args.output_len,
|
requests = [
|
||||||
get_random_flag()) for _ in range(args.num_prompts)]
|
(prompt, args.input_len, args.output_len, get_random_flag())
|
||||||
|
for _ in range(args.num_prompts)
|
||||||
|
]
|
||||||
else:
|
else:
|
||||||
requests = sample_requests(args.dataset, args.num_prompts, tokenizer,
|
requests = sample_requests(
|
||||||
args.output_len)
|
args.dataset, args.num_prompts, tokenizer, args.output_len
|
||||||
|
)
|
||||||
|
|
||||||
if args.backend == "vllm":
|
if args.backend == "vllm":
|
||||||
elapsed_time = run_vllm(requests, args.n,
|
elapsed_time = run_vllm(
|
||||||
EngineArgs.from_cli_args(args),
|
requests, args.n, EngineArgs.from_cli_args(args), args.disable_detokenize
|
||||||
args.disable_detokenize)
|
)
|
||||||
else:
|
else:
|
||||||
raise ValueError(f"Unknown backend: {args.backend}")
|
raise ValueError(f"Unknown backend: {args.backend}")
|
||||||
total_num_tokens = sum(prompt_len + output_len
|
total_num_tokens = sum(
|
||||||
for _, prompt_len, output_len, priority in requests)
|
prompt_len + output_len for _, prompt_len, output_len, priority in requests
|
||||||
print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
|
)
|
||||||
f"{total_num_tokens / elapsed_time:.2f} tokens/s")
|
print(
|
||||||
|
f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
|
||||||
|
f"{total_num_tokens / elapsed_time:.2f} tokens/s"
|
||||||
|
)
|
||||||
|
|
||||||
# Output JSON results if specified
|
# Output JSON results if specified
|
||||||
if args.output_json:
|
if args.output_json:
|
||||||
@ -145,46 +161,55 @@ def main(args: argparse.Namespace):
|
|||||||
json.dump(results, f, indent=4)
|
json.dump(results, f, indent=4)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
def create_argument_parser():
|
||||||
parser = FlexibleArgumentParser(description="Benchmark the throughput.")
|
parser = FlexibleArgumentParser(description="Benchmark the throughput.")
|
||||||
parser.add_argument("--backend",
|
|
||||||
type=str,
|
|
||||||
choices=["vllm", "hf", "mii"],
|
|
||||||
default="vllm")
|
|
||||||
parser.add_argument("--dataset",
|
|
||||||
type=str,
|
|
||||||
default=None,
|
|
||||||
help="Path to the dataset.")
|
|
||||||
parser.add_argument("--input-len",
|
|
||||||
type=int,
|
|
||||||
default=None,
|
|
||||||
help="Input prompt length for each request")
|
|
||||||
parser.add_argument("--output-len",
|
|
||||||
type=int,
|
|
||||||
default=None,
|
|
||||||
help="Output length for each request. Overrides the "
|
|
||||||
"output length from the dataset.")
|
|
||||||
parser.add_argument("--n",
|
|
||||||
type=int,
|
|
||||||
default=1,
|
|
||||||
help="Number of generated sequences per prompt.")
|
|
||||||
parser.add_argument("--num-prompts",
|
|
||||||
type=int,
|
|
||||||
default=200,
|
|
||||||
help="Number of prompts to process.")
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
'--output-json',
|
"--backend", type=str, choices=["vllm", "hf", "mii"], default="vllm"
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--dataset", type=str, default=None, help="Path to the dataset."
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--input-len",
|
||||||
|
type=int,
|
||||||
|
default=None,
|
||||||
|
help="Input prompt length for each request",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--output-len",
|
||||||
|
type=int,
|
||||||
|
default=None,
|
||||||
|
help="Output length for each request. Overrides the "
|
||||||
|
"output length from the dataset.",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--n", type=int, default=1, help="Number of generated sequences per prompt."
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--num-prompts", type=int, default=200, help="Number of prompts to process."
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--output-json",
|
||||||
type=str,
|
type=str,
|
||||||
default=None,
|
default=None,
|
||||||
help='Path to save the throughput results in JSON format.')
|
help="Path to save the throughput results in JSON format.",
|
||||||
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
'--disable-detokenize',
|
"--disable-detokenize",
|
||||||
action='store_true',
|
action="store_true",
|
||||||
help=("Do not detokenize responses (i.e. do not include "
|
help=(
|
||||||
"detokenization time in the latency measurement)"),
|
"Do not detokenize responses (i.e. do not include "
|
||||||
|
"detokenization time in the latency measurement)"
|
||||||
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
parser = EngineArgs.add_cli_args(parser)
|
parser = EngineArgs.add_cli_args(parser)
|
||||||
|
|
||||||
|
return parser
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
parser = create_argument_parser()
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
if args.tokenizer is None:
|
if args.tokenizer is None:
|
||||||
args.tokenizer = args.model
|
args.tokenizer = args.model
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
@ -1,4 +1,5 @@
|
|||||||
# SPDX-License-Identifier: Apache-2.0
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||||
r"""Benchmark online serving throughput with structured outputs.
|
r"""Benchmark online serving throughput with structured outputs.
|
||||||
|
|
||||||
On the server side, run one of the following commands:
|
On the server side, run one of the following commands:
|
||||||
@ -11,7 +12,6 @@ On the client side, run:
|
|||||||
--model <your_model> \
|
--model <your_model> \
|
||||||
--dataset json \
|
--dataset json \
|
||||||
--structured-output-ratio 1.0 \
|
--structured-output-ratio 1.0 \
|
||||||
--structured-output-backend auto \
|
|
||||||
--request-rate 10 \
|
--request-rate 10 \
|
||||||
--num-prompts 1000
|
--num-prompts 1000
|
||||||
|
|
||||||
@ -19,6 +19,7 @@ On the client side, run:
|
|||||||
--endpoint /generate_stream
|
--endpoint /generate_stream
|
||||||
to the end of the command above.
|
to the end of the command above.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
import asyncio
|
import asyncio
|
||||||
import copy
|
import copy
|
||||||
@ -36,11 +37,15 @@ from typing import Optional
|
|||||||
import datasets
|
import datasets
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput,
|
|
||||||
RequestFuncOutput)
|
|
||||||
from tqdm.asyncio import tqdm
|
from tqdm.asyncio import tqdm
|
||||||
from transformers import PreTrainedTokenizerBase
|
from transformers import PreTrainedTokenizerBase
|
||||||
|
|
||||||
|
from backend_request_func import (
|
||||||
|
ASYNC_REQUEST_FUNCS,
|
||||||
|
RequestFuncInput,
|
||||||
|
RequestFuncOutput,
|
||||||
|
)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
from vllm.transformers_utils.tokenizer import get_tokenizer
|
from vllm.transformers_utils.tokenizer import get_tokenizer
|
||||||
except ImportError:
|
except ImportError:
|
||||||
@ -52,7 +57,8 @@ except ImportError:
|
|||||||
from argparse import ArgumentParser as FlexibleArgumentParser
|
from argparse import ArgumentParser as FlexibleArgumentParser
|
||||||
|
|
||||||
from vllm.v1.structured_output.backend_xgrammar import (
|
from vllm.v1.structured_output.backend_xgrammar import (
|
||||||
has_xgrammar_unsupported_json_features)
|
has_xgrammar_unsupported_json_features,
|
||||||
|
)
|
||||||
|
|
||||||
MILLISECONDS_TO_SECONDS_CONVERSION = 1000
|
MILLISECONDS_TO_SECONDS_CONVERSION = 1000
|
||||||
|
|
||||||
@ -98,6 +104,7 @@ class SampleRequest:
|
|||||||
prompt_len: The length of the prompt in tokens.
|
prompt_len: The length of the prompt in tokens.
|
||||||
expected_output_len: The expected length of the output in tokens.
|
expected_output_len: The expected length of the output in tokens.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
prompt: str
|
prompt: str
|
||||||
prompt_len: int
|
prompt_len: int
|
||||||
expected_output_len: int
|
expected_output_len: int
|
||||||
@ -106,61 +113,61 @@ class SampleRequest:
|
|||||||
completion: str = None
|
completion: str = None
|
||||||
|
|
||||||
|
|
||||||
def sample_requests(tokenizer: PreTrainedTokenizerBase,
|
def sample_requests(
|
||||||
args: argparse.Namespace) -> list[SampleRequest]:
|
tokenizer: PreTrainedTokenizerBase, args: argparse.Namespace
|
||||||
if args.dataset == 'json' or args.dataset == 'json-unique':
|
) -> list[SampleRequest]:
|
||||||
|
if args.dataset == "json" or args.dataset == "json-unique":
|
||||||
if args.json_schema_path is None:
|
if args.json_schema_path is None:
|
||||||
dir_path = os.path.dirname(os.path.realpath(__file__))
|
dir_path = os.path.dirname(os.path.realpath(__file__))
|
||||||
args.json_schema_path = os.path.join(dir_path,
|
args.json_schema_path = os.path.join(
|
||||||
"structured_schemas",
|
dir_path, "structured_schemas", "structured_schema_1.json"
|
||||||
"structured_schema_1.json")
|
)
|
||||||
json_schemas = []
|
json_schemas = []
|
||||||
with open(args.json_schema_path) as f:
|
with open(args.json_schema_path) as f:
|
||||||
schema = json.load(f)
|
schema = json.load(f)
|
||||||
|
|
||||||
if args.dataset == 'json-unique':
|
if args.dataset == "json-unique":
|
||||||
json_schemas = [
|
json_schemas = [copy.deepcopy(schema) for _ in range(args.num_prompts)]
|
||||||
copy.deepcopy(schema) for _ in range(args.num_prompts)
|
|
||||||
]
|
|
||||||
for i in range(len(json_schemas)):
|
for i in range(len(json_schemas)):
|
||||||
json_schemas[i]["properties"][
|
if "properties" not in json_schemas[i]:
|
||||||
f"__optional_field_{uuid.uuid4()}"] = {
|
json_schemas[i]["properties"] = {}
|
||||||
"type":
|
json_schemas[i]["properties"][f"__optional_field_{uuid.uuid4()}"] = {
|
||||||
"string",
|
"type": "string",
|
||||||
"description":
|
"description": "An unique optional field to avoid cached schemas",
|
||||||
"An unique optional field to avoid cached schemas"
|
}
|
||||||
}
|
|
||||||
else:
|
else:
|
||||||
json_schemas = [schema] * args.num_prompts
|
json_schemas = [schema] * args.num_prompts
|
||||||
|
|
||||||
def gen_prompt(index: int):
|
def gen_prompt(index: int):
|
||||||
return f"Generate an example of a user profile given the following schema: {json.dumps(get_schema(index))}" # noqa: E501
|
return f"Generate an example of a brief user profile given the following schema: {json.dumps(get_schema(index))}" # noqa: E501
|
||||||
|
|
||||||
def get_schema(index: int):
|
def get_schema(index: int):
|
||||||
return json_schemas[index % len(json_schemas)]
|
return json_schemas[index % len(json_schemas)]
|
||||||
|
|
||||||
requests = [
|
requests = [
|
||||||
SampleRequest(prompt=gen_prompt(i),
|
SampleRequest(
|
||||||
prompt_len=len(tokenizer(gen_prompt(i)).input_ids),
|
prompt=gen_prompt(i),
|
||||||
expected_output_len=args.output_len,
|
prompt_len=len(tokenizer(gen_prompt(i)).input_ids),
|
||||||
schema=get_schema(i),
|
expected_output_len=args.output_len,
|
||||||
structure_type=args.structure_type)
|
schema=get_schema(i),
|
||||||
|
structure_type=args.structure_type,
|
||||||
|
)
|
||||||
for i in range(args.num_prompts)
|
for i in range(args.num_prompts)
|
||||||
]
|
]
|
||||||
|
|
||||||
elif args.dataset == "grammar":
|
elif args.dataset == "grammar":
|
||||||
schema = """
|
schema = """
|
||||||
?start: select_statement
|
root ::= select_statement
|
||||||
|
|
||||||
?select_statement: "SELECT " column_list " FROM " table_name
|
select_statement ::= "SELECT " column " from " table " where " condition
|
||||||
|
|
||||||
?column_list: column_name ("," column_name)*
|
column ::= "col_1 " | "col_2 "
|
||||||
|
|
||||||
?table_name: identifier
|
table ::= "table_1 " | "table_2 "
|
||||||
|
|
||||||
?column_name: identifier
|
condition ::= column "= " number
|
||||||
|
|
||||||
?identifier: /[a-zA-Z_][a-zA-Z0-9_]*/
|
number ::= "1 " | "2 "
|
||||||
"""
|
"""
|
||||||
prompt = "Generate an SQL query to show the 'username' \
|
prompt = "Generate an SQL query to show the 'username' \
|
||||||
and 'email' from the 'users' table."
|
and 'email' from the 'users' table."
|
||||||
@ -168,11 +175,13 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase,
|
|||||||
input_len = len(tokenizer(prompt).input_ids)
|
input_len = len(tokenizer(prompt).input_ids)
|
||||||
print(f"Input length of the prompt: {input_len} tokens")
|
print(f"Input length of the prompt: {input_len} tokens")
|
||||||
requests = [
|
requests = [
|
||||||
SampleRequest(prompt=prompt,
|
SampleRequest(
|
||||||
prompt_len=input_len,
|
prompt=prompt,
|
||||||
expected_output_len=args.output_len,
|
prompt_len=input_len,
|
||||||
schema=schema,
|
expected_output_len=args.output_len,
|
||||||
structure_type=args.structure_type)
|
schema=schema,
|
||||||
|
structure_type=args.structure_type,
|
||||||
|
)
|
||||||
for _ in range(args.num_prompts)
|
for _ in range(args.num_prompts)
|
||||||
]
|
]
|
||||||
|
|
||||||
@ -186,11 +195,13 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase,
|
|||||||
input_len = len(tokenizer(prompt).input_ids)
|
input_len = len(tokenizer(prompt).input_ids)
|
||||||
print(f"Input length of the prompt: {input_len} tokens")
|
print(f"Input length of the prompt: {input_len} tokens")
|
||||||
requests = [
|
requests = [
|
||||||
SampleRequest(prompt=prompt,
|
SampleRequest(
|
||||||
prompt_len=input_len,
|
prompt=prompt,
|
||||||
expected_output_len=args.output_len,
|
prompt_len=input_len,
|
||||||
schema=regex,
|
expected_output_len=args.output_len,
|
||||||
structure_type=args.structure_type)
|
schema=regex,
|
||||||
|
structure_type=args.structure_type,
|
||||||
|
)
|
||||||
for _ in range(args.num_prompts)
|
for _ in range(args.num_prompts)
|
||||||
]
|
]
|
||||||
|
|
||||||
@ -201,47 +212,55 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase,
|
|||||||
input_len = len(tokenizer(prompt).input_ids)
|
input_len = len(tokenizer(prompt).input_ids)
|
||||||
print(f"Input length of the prompt: {input_len} tokens")
|
print(f"Input length of the prompt: {input_len} tokens")
|
||||||
requests = [
|
requests = [
|
||||||
SampleRequest(prompt=prompt,
|
SampleRequest(
|
||||||
prompt_len=input_len,
|
prompt=prompt,
|
||||||
expected_output_len=args.output_len,
|
prompt_len=input_len,
|
||||||
schema=choice,
|
expected_output_len=args.output_len,
|
||||||
structure_type=args.structure_type)
|
schema=choice,
|
||||||
|
structure_type=args.structure_type,
|
||||||
|
)
|
||||||
for _ in range(args.num_prompts)
|
for _ in range(args.num_prompts)
|
||||||
]
|
]
|
||||||
|
|
||||||
elif args.dataset == "xgrammar_bench":
|
elif args.dataset == "xgrammar_bench":
|
||||||
requests: list[SampleRequest] = []
|
requests: list[SampleRequest] = []
|
||||||
dataset = datasets.load_dataset("NousResearch/json-mode-eval",
|
dataset = datasets.load_dataset("NousResearch/json-mode-eval", split="train")
|
||||||
split="train")
|
|
||||||
full_dataset_len = len(dataset)
|
full_dataset_len = len(dataset)
|
||||||
|
|
||||||
def _filter_func(item):
|
def _filter_func(item):
|
||||||
import json
|
import json
|
||||||
|
|
||||||
schema = json.loads(item["schema"])
|
schema = json.loads(item["schema"])
|
||||||
return not has_xgrammar_unsupported_json_features(schema)
|
return not has_xgrammar_unsupported_json_features(schema)
|
||||||
|
|
||||||
dataset = dataset.filter(_filter_func)
|
dataset = dataset.filter(_filter_func)
|
||||||
num_filtered_out = full_dataset_len - len(dataset)
|
num_filtered_out = full_dataset_len - len(dataset)
|
||||||
print(f"dataset has {len(dataset)} entries after filtering "
|
print(
|
||||||
f"out {num_filtered_out} entries with unsupported features")
|
f"dataset has {len(dataset)} entries after filtering "
|
||||||
|
f"out {num_filtered_out} entries with unsupported features"
|
||||||
|
)
|
||||||
len_dataset = len(dataset)
|
len_dataset = len(dataset)
|
||||||
for data_point_idx in range(args.num_prompts):
|
for data_point_idx in range(args.num_prompts):
|
||||||
idx = data_point_idx
|
idx = data_point_idx
|
||||||
while idx >= len_dataset:
|
while idx >= len_dataset:
|
||||||
idx -= len_dataset
|
idx -= len_dataset
|
||||||
schema = dataset["schema"][idx]
|
schema = dataset["schema"][idx]
|
||||||
prompt = tokenizer.apply_chat_template(dataset["prompt"][idx],
|
prompt = tokenizer.apply_chat_template(
|
||||||
tokenize=False)
|
dataset["prompt"][idx], tokenize=False, add_generation_prompt=True
|
||||||
|
)
|
||||||
input_len = len(tokenizer(prompt).input_ids)
|
input_len = len(tokenizer(prompt).input_ids)
|
||||||
completion = dataset["completion"][idx]
|
completion = dataset["completion"][idx]
|
||||||
|
|
||||||
requests.append(
|
requests.append(
|
||||||
SampleRequest(prompt=prompt,
|
SampleRequest(
|
||||||
prompt_len=input_len,
|
prompt=prompt,
|
||||||
expected_output_len=args.output_len,
|
prompt_len=input_len,
|
||||||
schema=schema,
|
expected_output_len=args.output_len,
|
||||||
structure_type=args.structure_type,
|
schema=schema,
|
||||||
completion=completion))
|
structure_type=args.structure_type,
|
||||||
|
completion=completion,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
return requests
|
return requests
|
||||||
|
|
||||||
@ -273,7 +292,8 @@ async def get_request(
|
|||||||
|
|
||||||
# Calculate scale parameter theta to maintain the desired request_rate.
|
# Calculate scale parameter theta to maintain the desired request_rate.
|
||||||
assert burstiness > 0, (
|
assert burstiness > 0, (
|
||||||
f"A positive burstiness factor is expected, but given {burstiness}.")
|
f"A positive burstiness factor is expected, but given {burstiness}."
|
||||||
|
)
|
||||||
theta = 1.0 / (request_rate * burstiness)
|
theta = 1.0 / (request_rate * burstiness)
|
||||||
|
|
||||||
for i, request in enumerate(input_requests):
|
for i, request in enumerate(input_requests):
|
||||||
@ -315,8 +335,8 @@ def calculate_metrics(
|
|||||||
# multiple output tokens may be bundled together
|
# multiple output tokens may be bundled together
|
||||||
# Note : this may inflate the output token count slightly
|
# Note : this may inflate the output token count slightly
|
||||||
output_len = len(
|
output_len = len(
|
||||||
tokenizer(outputs[i].generated_text,
|
tokenizer(outputs[i].generated_text, add_special_tokens=False).input_ids
|
||||||
add_special_tokens=False).input_ids)
|
)
|
||||||
actual_output_lens.append(output_len)
|
actual_output_lens.append(output_len)
|
||||||
total_input += input_requests[i].prompt_len
|
total_input += input_requests[i].prompt_len
|
||||||
tpot = 0
|
tpot = 0
|
||||||
@ -340,16 +360,19 @@ def calculate_metrics(
|
|||||||
|
|
||||||
if "ttft" in goodput_config_dict:
|
if "ttft" in goodput_config_dict:
|
||||||
valid_metrics.append(ttfts)
|
valid_metrics.append(ttfts)
|
||||||
slo_values.append(goodput_config_dict["ttft"] /
|
slo_values.append(
|
||||||
MILLISECONDS_TO_SECONDS_CONVERSION)
|
goodput_config_dict["ttft"] / MILLISECONDS_TO_SECONDS_CONVERSION
|
||||||
|
)
|
||||||
if "tpot" in goodput_config_dict:
|
if "tpot" in goodput_config_dict:
|
||||||
valid_metrics.append(all_tpots)
|
valid_metrics.append(all_tpots)
|
||||||
slo_values.append(goodput_config_dict["tpot"] /
|
slo_values.append(
|
||||||
MILLISECONDS_TO_SECONDS_CONVERSION)
|
goodput_config_dict["tpot"] / MILLISECONDS_TO_SECONDS_CONVERSION
|
||||||
|
)
|
||||||
if "e2el" in goodput_config_dict:
|
if "e2el" in goodput_config_dict:
|
||||||
valid_metrics.append(e2els)
|
valid_metrics.append(e2els)
|
||||||
slo_values.append(goodput_config_dict["e2el"] /
|
slo_values.append(
|
||||||
MILLISECONDS_TO_SECONDS_CONVERSION)
|
goodput_config_dict["e2el"] / MILLISECONDS_TO_SECONDS_CONVERSION
|
||||||
|
)
|
||||||
|
|
||||||
for req_metric in zip(*valid_metrics):
|
for req_metric in zip(*valid_metrics):
|
||||||
is_good_req = all([s >= r for s, r in zip(slo_values, req_metric)])
|
is_good_req = all([s >= r for s, r in zip(slo_values, req_metric)])
|
||||||
@ -360,7 +383,8 @@ def calculate_metrics(
|
|||||||
warnings.warn(
|
warnings.warn(
|
||||||
"All requests failed. This is likely due to a misconfiguration "
|
"All requests failed. This is likely due to a misconfiguration "
|
||||||
"on the benchmark arguments.",
|
"on the benchmark arguments.",
|
||||||
stacklevel=2)
|
stacklevel=2,
|
||||||
|
)
|
||||||
metrics = BenchmarkMetrics(
|
metrics = BenchmarkMetrics(
|
||||||
completed=completed,
|
completed=completed,
|
||||||
total_input=total_input,
|
total_input=total_input,
|
||||||
@ -369,27 +393,31 @@ def calculate_metrics(
|
|||||||
request_goodput=good_completed / dur_s,
|
request_goodput=good_completed / dur_s,
|
||||||
output_throughput=sum(actual_output_lens) / dur_s,
|
output_throughput=sum(actual_output_lens) / dur_s,
|
||||||
total_token_throughput=(total_input + sum(actual_output_lens)) / dur_s,
|
total_token_throughput=(total_input + sum(actual_output_lens)) / dur_s,
|
||||||
mean_ttft_ms=np.mean(ttfts or 0) *
|
mean_ttft_ms=np.mean(ttfts or 0)
|
||||||
1000, # ttfts is empty if streaming is not supported by backend
|
* 1000, # ttfts is empty if streaming is not supported by backend
|
||||||
std_ttft_ms=np.std(ttfts or 0) * 1000,
|
std_ttft_ms=np.std(ttfts or 0) * 1000,
|
||||||
median_ttft_ms=np.median(ttfts or 0) * 1000,
|
median_ttft_ms=np.median(ttfts or 0) * 1000,
|
||||||
percentiles_ttft_ms=[(p, np.percentile(ttfts or 0, p) * 1000)
|
percentiles_ttft_ms=[
|
||||||
for p in selected_percentiles],
|
(p, np.percentile(ttfts or 0, p) * 1000) for p in selected_percentiles
|
||||||
|
],
|
||||||
mean_tpot_ms=np.mean(tpots or 0) * 1000,
|
mean_tpot_ms=np.mean(tpots or 0) * 1000,
|
||||||
std_tpot_ms=np.std(tpots or 0) * 1000,
|
std_tpot_ms=np.std(tpots or 0) * 1000,
|
||||||
median_tpot_ms=np.median(tpots or 0) * 1000,
|
median_tpot_ms=np.median(tpots or 0) * 1000,
|
||||||
percentiles_tpot_ms=[(p, np.percentile(tpots or 0, p) * 1000)
|
percentiles_tpot_ms=[
|
||||||
for p in selected_percentiles],
|
(p, np.percentile(tpots or 0, p) * 1000) for p in selected_percentiles
|
||||||
|
],
|
||||||
mean_itl_ms=np.mean(itls or 0) * 1000,
|
mean_itl_ms=np.mean(itls or 0) * 1000,
|
||||||
std_itl_ms=np.std(itls or 0) * 1000,
|
std_itl_ms=np.std(itls or 0) * 1000,
|
||||||
median_itl_ms=np.median(itls or 0) * 1000,
|
median_itl_ms=np.median(itls or 0) * 1000,
|
||||||
percentiles_itl_ms=[(p, np.percentile(itls or 0, p) * 1000)
|
percentiles_itl_ms=[
|
||||||
for p in selected_percentiles],
|
(p, np.percentile(itls or 0, p) * 1000) for p in selected_percentiles
|
||||||
|
],
|
||||||
mean_e2el_ms=np.mean(e2els or 0) * 1000,
|
mean_e2el_ms=np.mean(e2els or 0) * 1000,
|
||||||
std_e2el_ms=np.std(e2els or 0) * 1000,
|
std_e2el_ms=np.std(e2els or 0) * 1000,
|
||||||
median_e2el_ms=np.median(e2els or 0) * 1000,
|
median_e2el_ms=np.median(e2els or 0) * 1000,
|
||||||
percentiles_e2el_ms=[(p, np.percentile(e2els or 0, p) * 1000)
|
percentiles_e2el_ms=[
|
||||||
for p in selected_percentiles],
|
(p, np.percentile(e2els or 0, p) * 1000) for p in selected_percentiles
|
||||||
|
],
|
||||||
)
|
)
|
||||||
|
|
||||||
return metrics, actual_output_lens
|
return metrics, actual_output_lens
|
||||||
@ -411,7 +439,6 @@ async def benchmark(
|
|||||||
ignore_eos: bool,
|
ignore_eos: bool,
|
||||||
max_concurrency: Optional[int],
|
max_concurrency: Optional[int],
|
||||||
structured_output_ratio: float,
|
structured_output_ratio: float,
|
||||||
structured_output_backend: str,
|
|
||||||
goodput_config_dict: Optional[dict[str, float]] = None,
|
goodput_config_dict: Optional[dict[str, float]] = None,
|
||||||
):
|
):
|
||||||
if backend in ASYNC_REQUEST_FUNCS:
|
if backend in ASYNC_REQUEST_FUNCS:
|
||||||
@ -423,18 +450,17 @@ async def benchmark(
|
|||||||
extra_body = {}
|
extra_body = {}
|
||||||
# Add the schema to the extra_body
|
# Add the schema to the extra_body
|
||||||
extra_body[request.structure_type] = request.schema
|
extra_body[request.structure_type] = request.schema
|
||||||
# Add the specific structured_output_backend
|
|
||||||
extra_body["guided_decoding_backend"] = structured_output_backend
|
|
||||||
return extra_body
|
return extra_body
|
||||||
|
|
||||||
print("Starting initial single prompt test run...")
|
print("Starting initial single prompt test run...")
|
||||||
structured_output_req_idx = random.sample(
|
structured_output_req_idx = random.sample(
|
||||||
range(len(input_requests)),
|
range(len(input_requests)), int(len(input_requests) * structured_output_ratio)
|
||||||
int(len(input_requests) * structured_output_ratio))
|
)
|
||||||
|
|
||||||
test_request = input_requests[0]
|
test_request = input_requests[0]
|
||||||
test_req_extra_body = (prepare_extra_body(test_request)
|
test_req_extra_body = (
|
||||||
if 0 in structured_output_req_idx else None)
|
prepare_extra_body(test_request) if 0 in structured_output_req_idx else None
|
||||||
|
)
|
||||||
test_input = RequestFuncInput(
|
test_input = RequestFuncInput(
|
||||||
model=model_id,
|
model=model_id,
|
||||||
prompt=test_request.prompt,
|
prompt=test_request.prompt,
|
||||||
@@ -448,7 +474,8 @@ async def benchmark(
     if not test_output.success:
         raise ValueError(
             "Initial test run failed - Please make sure benchmark arguments "
-            f"are correctly specified. Error: {test_output.error}")
+            f"are correctly specified. Error: {test_output.error}"
+        )
     else:
         print("Initial test run completed. Starting main benchmark run...")
@@ -467,10 +494,7 @@ async def benchmark(
     if profile_output.success:
         print("Profiler started")

-    if burstiness == 1.0:
-        distribution = "Poisson process"
-    else:
-        distribution = "Gamma distribution"
+    distribution = "Poisson process" if burstiness == 1.0 else "Gamma distribution"

     print(f"Traffic request rate: {request_rate}")
     print(f"Burstiness factor: {burstiness} ({distribution})")
@@ -482,24 +506,21 @@ async def benchmark(
    (formatting only: the semaphore assignment becomes a single line,
    semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None; the two
    return await request_func(request_func_input=request_func_input, pbar=pbar) calls in
    limited_request_func are joined onto single lines; and the
    async for i, request in get_request(input_requests, request_rate, burstiness) loop header
    and the extra_body = (prepare_extra_body(request) if i in structured_output_req_idx else None)
    assignment are re-wrapped; no behavior change)

@@ -512,8 +533,9 @@ async def benchmark(
    (formatting only: the tasks.append(asyncio.create_task(limited_request_func(...))) call is
    re-wrapped with the closing parentheses on their own lines)
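A self-contained sketch of the optional concurrency limit summarized above, with a dummy request_func standing in for the real backend call; the request count and delay are illustrative:

import asyncio
from typing import Optional


async def request_func(i: int) -> str:
    # Stand-in for the real per-request call to the serving backend.
    await asyncio.sleep(0.01)
    return f"response {i}"


async def run(max_concurrency: Optional[int]):
    # None means "no limit", mirroring the semaphore-or-None pattern above.
    semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None

    async def limited_request_func(i: int) -> str:
        if semaphore is None:
            return await request_func(i)
        async with semaphore:
            return await request_func(i)

    tasks = [asyncio.create_task(limited_request_func(i)) for i in range(8)]
    return await asyncio.gather(*tasks)


print(asyncio.run(run(max_concurrency=2)))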
@@ -545,54 +567,58 @@ async def benchmark(
    (formatting only: the "Serving Benchmark Result" print block switches to double quotes and
    each wrapped print("{:<40} ...".format(...)) call is re-flowed; the result dict is rewritten
    from the key-on-one-line / value-on-the-next layout into one-line entries for duration,
    completed, total_input_tokens, total_output_tokens, request_throughput, output_throughput,
    total_token_throughput, ttft_description, tpot_description, input_lens, output_lens, ttfts,
    itls, and errors, with the pd.Series([...]).describe().to_dict() chains split across lines;
    the ret = [{"generated": output.generated_text, "expected": gt} for output, gt in
    zip(outputs, expected)] comprehension is re-wrapped with double-quoted keys)
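A small sketch of the pandas describe() summary used for the ttft_description and tpot_description entries above, assuming pandas is installed; the TTFT samples are made up:

import pandas as pd

ttfts = [0.21, 0.18, 0.25, 0.30]  # illustrative time-to-first-token samples, in seconds
print(pd.Series(ttfts).describe().to_dict())
# keys: count, mean, std, min, 25%, 50%, 75%, max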
@@ -606,29 +632,35 @@ async def benchmark(
    (formatting only: in process_one_metric, the header print switches to double quotes, the
    Mean/Median print("{:<40} {:<10.2f}".format(...)) calls and the
    result[f"mean_/median_/std_{metric_attribute_name}_ms"] = getattr(...) assignments are
    re-wrapped, the percentile loop header is joined onto one line, and the
    process_one_metric("tpot", "TPOT", "Time per Output Token (excl. 1st token)") call becomes
    a single line)
@@ -638,13 +670,13 @@ async def benchmark(


 def evaluate(ret, args):

     def _eval_correctness_json(expected, actual):
         # extract json string from string using regex
-        import re
-        actual = actual.replace('\n', '').replace(' ', '').strip()
+        import regex as re
+
+        actual = actual.replace("\n", "").replace(" ", "").strip()
         try:
-            actual = re.search(r'\{.*\}', actual).group()
+            actual = re.search(r"\{.*\}", actual).group()
             actual = json.loads(actual)
         except Exception:
             return False
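A short sketch of the JSON-extraction check above, assuming the third-party regex package is installed (it is drop-in compatible with the re.search call used here); the sample string is illustrative:

import json

import regex as re  # third-party package, API-compatible with the stdlib calls used here


def extract_json(actual: str):
    # Mirror the cleanup in _eval_correctness_json: strip whitespace, then pull
    # out the outermost {...} span and parse it.
    actual = actual.replace("\n", "").replace(" ", "").strip()
    try:
        actual = re.search(r"\{.*\}", actual).group()
        return json.loads(actual)
    except Exception:
        return None


print(extract_json('Sure: {"name": "vLLM", "ok": true}'))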
@@ -655,29 +687,33 @@ def evaluate(ret, args):
         return actual in args.choice

     def _eval_correctness_regex(expected, actual):
-        import re
+        import regex as re

         return re.match(args.regex, actual) is not None

    (the rest of the hunk is formatting only: the structure_type comparisons in
    _eval_correctness, the res["expected"] / res["generated"] / res["correctness"] keys, and
    the final percentage expression switch to double quotes and are re-wrapped)
@@ -689,9 +725,10 @@ def parse_goodput(slo_pairs):
    (formatting only: the argparse.ArgumentTypeError message "Specify service level objectives
    for goodput as "KEY:VALUE" pairs, where the key is a metric name, and the value is a number
    in milliseconds." is re-wrapped, with the escaped quotes replaced by a single-quoted string)

@@ -705,12 +742,14 @@ def check_goodput_args(args):
    (formatting only: the two ValueError messages for an invalid metric name and for a negative
    service level objective value are re-wrapped with the closing parenthesis on its own line)
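An illustrative sketch, not the benchmark's exact code, of parsing "KEY:VALUE" service level objectives into a goodput config dict, as the error messages above describe (key is a metric name, value is milliseconds):

def parse_goodput(slo_pairs):
    # Each pair looks like "ttft:300": a metric name and a value in milliseconds.
    goodput_config_dict = {}
    for slo_pair in slo_pairs:
        slo_name, slo_val = slo_pair.split(":")
        goodput_config_dict[slo_name] = float(slo_val)
    return goodput_config_dict


print(parse_goodput(["ttft:300", "tpot:50"]))  # {'ttft': 300.0, 'tpot': 50.0}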
@@ -736,19 +775,19 @@ def main(args: argparse.Namespace):
    (formatting only: the dataset-to-structure_type mapping (grammar -> guided_grammar,
    regex -> guided_regex, choice -> guided_choice, otherwise guided_json) and the
    result_file_name f-string switch from single to double quotes)
@@ -776,37 +815,29 @@ def main(args: argparse.Namespace):
             disable_tqdm=args.disable_tqdm,
             profile=args.profile,
             selected_percentile_metrics=args.percentile_metrics.split(","),
-            selected_percentiles=[
-                float(p) for p in args.metric_percentiles.split(",")
-            ],
+            selected_percentiles=[float(p) for p in args.metric_percentiles.split(",")],
             ignore_eos=args.ignore_eos,
             max_concurrency=args.max_concurrency,
             structured_output_ratio=args.structured_output_ratio,
-            structured_output_backend=args.structured_output_backend,
             goodput_config_dict=goodput_config_dict,
-        ))
+        )
+    )

     # Save config and results to json
     score = evaluate(ret, args)
-    print("correct_rate(%)", score, '\n')
+    print("correct_rate(%)", score, "\n")
     if args.save_results:

    (the results dict that follows is compacted from the key-on-one-line / value-on-the-next
    layout into one-line entries for backend, model_id, tokenizer_id, num_prompts, request_rate,
    burstiness, max_concurrency, and correct_rate(%), with the
    args.request_rate if args.request_rate < float("inf") else "inf" expression wrapped across
    lines; the results = {"outputs": ret, **results, **benchmark_result} merge is unchanged)
@@ -815,13 +846,14 @@ def main(args: argparse.Namespace):
         result_file_name = args.result_filename
         if args.result_dir:
             result_file_name = os.path.join(args.result_dir, result_file_name)
-        with open(result_file_name, "w", encoding='utf-8') as outfile:
+        with open(result_file_name, "w", encoding="utf-8") as outfile:
             json.dump(results, outfile, indent=4)


-if __name__ == "__main__":
+def create_argument_parser():
     parser = FlexibleArgumentParser(
-        description="Benchmark the online serving throughput.")
+        description="Benchmark the online serving throughput."
+    )
     parser.add_argument(
         "--backend",
         type=str,
@@ -843,16 +875,14 @@ if __name__ == "__main__":
         default="/v1/completions",
         help="API endpoint.",
     )
-    parser.add_argument("--dataset",
-                        default='json',
-                        choices=[
-                            'json', 'json-unique', 'grammar', 'regex',
-                            'choice', 'xgrammar_bench'
-                        ])
-    parser.add_argument("--json_schema_path",
-                        type=str,
-                        default=None,
-                        help="Path to json schema.")
+    parser.add_argument(
+        "--dataset",
+        default="json",
+        choices=["json", "json-unique", "grammar", "regex", "choice", "xgrammar_bench"],
+    )
+    parser.add_argument(
+        "--json-schema-path", type=str, default=None, help="Path to json schema."
+    )
     parser.add_argument(
         "--max-concurrency",
         type=int,

@@ -864,7 +894,8 @@ if __name__ == "__main__":
    (formatting only: the --max-concurrency help string gains a trailing comma and the closing
    parenthesis moves to its own line)

@@ -874,15 +905,13 @@ if __name__ == "__main__":
    (formatting only: the help= strings for --tokenizer and --tokenizer-mode are joined onto
    single lines, keeping the "Name or path of the tokenizer, if not using the default
    tokenizer." text and the noqa: E501 comments)
@@ -959,52 +988,56 @@ if __name__ == "__main__":
    (the --ignore-eos, --percentile-metrics, --metric-percentiles, and --goodput
    parser.add_argument calls are re-wrapped with trailing commas and the escaped quotes in
    their help strings replaced by single-quoted strings; --no-structured-output and
    --structured-output-ratio are re-wrapped one keyword per line; the tail of the hunk changes
    as follows)

-    parser.add_argument("--structured-output-backend",
-                        type=str,
-                        choices=[
-                            "outlines", "lm-format-enforcer", "xgrammar",
-                            "guidance", "auto"
-                        ],
-                        default="auto",
-                        help="Backend to use for structured outputs")

+    return parser
+
+
+if __name__ == "__main__":
+    parser = create_argument_parser()
     args = parser.parse_args()
     main(args)
(second file in the compare view: the offline throughput benchmark)

@@ -1,5 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Benchmark offline inference throughput."""

 import argparse
 import dataclasses
 import json
@@ -11,18 +13,25 @@ from typing import Any, Optional, Union
    (import reorganization only: the transformers import is joined onto one line, and the local
    benchmark_dataset / benchmark_utils imports move below the third-party block and are
    expanded to one name per line; the block now reads)

import torch
import uvloop
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerBase

from benchmark_dataset import (
    AIMODataset,
    BurstGPTDataset,
    ConversationDataset,
    InstructCoderDataset,
    RandomDataset,
    SampleRequest,
    ShareGPTDataset,
    SonnetDataset,
    VisionArenaDataset,
)
from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
from vllm.entrypoints.openai.api_server import (
    build_async_engine_client_from_engine_args,
)
from vllm.inputs import TextPrompt, TokensPrompt
from vllm.lora.request import LoRARequest
from vllm.outputs import RequestOutput
@@ -37,23 +46,30 @@ def run_vllm(
    (formatting only: the max_model_len assertion and its message, and the
    TokensPrompt(...) / TextPrompt(...) conditional in the prompt-building loop, are re-wrapped
    one keyword per line)

@@ -62,7 +78,8 @@ def run_vllm(
    (formatting only: the SamplingParams(...) call gets its closing parentheses on separate
    lines)
@@ -72,16 +89,15 @@ def run_vllm(
     outputs = None
     if not use_beam_search:
         start = time.perf_counter()
-        outputs = llm.generate(prompts,
-                               sampling_params,
-                               lora_request=lora_requests,
-                               use_tqdm=True)
+        outputs = llm.generate(
+            prompts, sampling_params, lora_request=lora_requests, use_tqdm=True
+        )
         end = time.perf_counter()
     else:
         assert lora_requests is None, "BeamSearch API does not support LoRA"
         prompts = [request.prompt for request in requests]
         # output_len should be the same for all requests.
-        output_len = requests[0][2]
+        output_len = requests[0].expected_output_len
         for request in requests:
             assert request.expected_output_len == output_len
         start = time.perf_counter()
@@ -91,30 +107,35 @@ def run_vllm(
    (formatting only: the beam_width/max_tokens/ignore_eos parameter block gets its closing
    parentheses on separate lines, and the run_vllm_chat signature wraps disable_detokenize and
    the return annotation onto their own lines; the docstring and body are unchanged)

@@ -128,7 +149,8 @@ def run_vllm_chat(
    (formatting only: the SamplingParams(...) call gets its closing parentheses on separate
    lines)
@@ -145,13 +167,17 @@ async def run_vllm_async(
     from vllm import SamplingParams

     async with build_async_engine_client_from_engine_args(
-            engine_args, disable_frontend_multiprocessing) as llm:
+        engine_args, disable_frontend_multiprocessing
+    ) as llm:
+        model_config = await llm.get_model_config()
         assert all(
-            llm.model_config.max_model_len >= (request.prompt_len +
-                                               request.expected_output_len)
-            for request in requests), (
-                "Please ensure that max_model_len is greater than the sum of"
-                " prompt_len and expected_output_len for all requests.")
+            model_config.max_model_len
+            >= (request.prompt_len + request.expected_output_len)
+            for request in requests
+        ), (
+            "Please ensure that max_model_len is greater than the sum of"
+            " prompt_len and expected_output_len for all requests."
+        )

         # Add the requests to the engine.
         prompts: list[Union[TextPrompt, TokensPrompt]] = []
@@ -159,11 +185,15 @@ async def run_vllm_async(
    (formatting only: the TokensPrompt(...) / TextPrompt(...) conditional and the
    SamplingParams(...) call in the request-building loop are re-wrapped, one keyword per line)

@@ -172,17 +202,16 @@ async def run_vllm_async(
    (formatting only: the for i, (prompt, sp, lr) in enumerate(zip(prompts, sampling_params,
    lora_requests)) loop header is re-wrapped and the
    llm.generate(prompt, sp, lora_request=lr, request_id=f"test{i}") call is joined onto one
    line before being appended to generators and merged with merge_async_iterators)
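A self-contained sketch of the fan-out pattern summarized above: several async generators are started up front and their results consumed as they finish. The generate coroutine and the queue are stand-ins; the real code merges vLLM generators with merge_async_iterators instead.

import asyncio


async def generate(request_id: str, delay: float):
    # Stand-in async generator that "streams" one final result per request.
    await asyncio.sleep(delay)
    yield f"{request_id}: done"


async def main():
    queue: asyncio.Queue = asyncio.Queue()

    async def drain(gen):
        async for item in gen:
            await queue.put(item)

    gens = [generate(f"test{i}", 0.05 * (3 - i)) for i in range(3)]
    tasks = [asyncio.create_task(drain(g)) for g in gens]
    for _ in range(len(gens)):
        print(await queue.get())  # results arrive in completion order, not submit order
    await asyncio.gather(*tasks)


asyncio.run(main())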
@@ -201,7 +230,8 @@ def run_hf(
    (formatting only: the AutoModelForCausalLM.from_pretrained(model, torch_dtype=torch.float16,
    trust_remote_code=trust_remote_code) call is re-wrapped)

@@ -224,14 +254,15 @@ def run_hf(
    (formatting only: the 2048-token batch-growth check becomes a parenthesized multi-line
    condition and the tokenizer(batch, return_tensors="pt", padding=True).input_ids call is
    joined onto one line)

@@ -261,6 +292,7 @@ def run_mii(
    (whitespace only: a blank line is added near the from mii import client, serve import)

@@ -272,8 +304,9 @@ def run_mii(
    (formatting only: the save_to_pytorch_benchmark_format signature wraps args and results
    onto their own lines)

@@ -281,9 +314,9 @@ def save_to_pytorch_benchmark_format(args: argparse.Namespace,
    (formatting only: the extra_info dict comprehension over elapsed_time, num_requests, and
    total_num_tokens is joined onto one line)
@@ -315,30 +348,32 @@ def get_requests(args, tokenizer):
         sample_kwargs["enable_multimodal_chat"] = True
     elif args.dataset_name == "sonnet":
         assert tokenizer.chat_template or tokenizer.default_chat_template, (
-            "Tokenizer/model must have chat template for sonnet dataset.")
+            "Tokenizer/model must have chat template for sonnet dataset."
+        )
         dataset_cls = SonnetDataset
         sample_kwargs["prefix_len"] = args.prefix_len
         sample_kwargs["return_prompt_formatted"] = True
     elif args.dataset_name == "burstgpt":
         dataset_cls = BurstGPTDataset
     elif args.dataset_name == "hf":
+        common_kwargs["no_stream"] = args.no_stream
         if args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS:
             dataset_cls = VisionArenaDataset
-            common_kwargs['dataset_subset'] = None
-            common_kwargs['dataset_split'] = "train"
+            common_kwargs["dataset_subset"] = None
+            common_kwargs["dataset_split"] = "train"
             sample_kwargs["enable_multimodal_chat"] = True

    (the remaining InstructCoderDataset / ConversationDataset / AIMODataset branches get the
    same single-to-double-quote change on their dataset_subset and dataset_split keys)
@@ -353,10 +388,10 @@ def main(args: argparse.Namespace):
    (formatting only: the AutoTokenizer.from_pretrained(args.tokenizer,
    trust_remote_code=args.trust_remote_code) call is re-wrapped and the is_multi_modal
    generator expression is joined onto one line)

@@ -367,23 +402,34 @@ def main(args: argparse.Namespace):
    (formatting only: the run_vllm_async, run_vllm, run_hf, run_mii, and run_vllm_chat call
    sites are re-wrapped, one argument per line)

@@ -395,28 +441,31 @@ def main(args: argparse.Namespace):
    (formatting only: the prompt/output token counting expressions, the multi-modal WARNING
    print about image tokens not being counted (vllm-project/vllm/issues/9778), and the final
    Throughput / total tokens/s / output tokens/s print are re-wrapped)
@@ -444,7 +493,8 @@ def validate_args(args):
    (formatting only: the '--dataset' deprecation warnings.warn(...) call gains a trailing
    comma, with stacklevel=2 and the closing parenthesis on their own lines)

@@ -457,9 +507,8 @@ def validate_args(args):
    (formatting only: the "When dataset path is not set, it will default to random dataset"
    print is joined onto one line and args.dataset_name = "random" switches to double quotes)

@@ -467,41 +516,55 @@ def validate_args(args):
    (formatting only: the --hf-subset/--hf-split warning, the VisionArena/Conversation and
    InstructCoder/AIMO backend assertions, the "is not supported by hf dataset" ValueError, the
    --random-range-ratio and --prefix-len warnings, and the LoRA-backend ValueError are
    re-wrapped into parenthesized multi-line conditions and messages; no behavior change)

@@ -511,8 +574,10 @@ def validate_args(args):
    (formatting only: the hf/mii quantization check becomes a parenthesized multi-line if
    condition)
@ -520,29 +585,37 @@ def validate_args(args):
|
|||||||
if args.backend == "mii" and args.n != 1:
|
if args.backend == "mii" and args.n != 1:
|
||||||
raise ValueError("n must be 1 for MII backend.")
|
raise ValueError("n must be 1 for MII backend.")
|
||||||
if args.backend == "mii" and args.tokenizer != args.model:
|
if args.backend == "mii" and args.tokenizer != args.model:
|
||||||
raise ValueError(
|
raise ValueError("Tokenizer must be the same as the model for MII backend.")
|
||||||
"Tokenizer must be the same as the model for MII backend.")
|
|
||||||
|
|
||||||
# --data-parallel is not supported currently.
|
# --data-parallel is not supported currently.
|
||||||
# https://github.com/vllm-project/vllm/issues/16222
|
# https://github.com/vllm-project/vllm/issues/16222
|
||||||
if args.data_parallel_size > 1:
|
if args.data_parallel_size > 1:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
"Data parallel is not supported in offline benchmark, \
|
"Data parallel is not supported in offline benchmark, \
|
||||||
please use benchmark serving instead")
|
please use benchmark serving instead"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
def create_argument_parser():
|
||||||
parser = FlexibleArgumentParser(description="Benchmark the throughput.")
|
parser = FlexibleArgumentParser(description="Benchmark the throughput.")
|
||||||
parser.add_argument("--backend",
|
parser.add_argument(
|
||||||
type=str,
|
"--backend",
|
||||||
choices=["vllm", "hf", "mii", "vllm-chat"],
|
type=str,
|
||||||
default="vllm")
|
choices=["vllm", "hf", "mii", "vllm-chat"],
|
||||||
|
default="vllm",
|
||||||
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--dataset-name",
|
"--dataset-name",
|
||||||
type=str,
|
type=str,
|
||||||
choices=["sharegpt", "random", "sonnet", "burstgpt", "hf"],
|
choices=["sharegpt", "random", "sonnet", "burstgpt", "hf"],
|
||||||
help="Name of the dataset to benchmark on.",
|
help="Name of the dataset to benchmark on.",
|
||||||
default="sharegpt")
|
default="sharegpt",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--no-stream",
|
||||||
|
action="store_true",
|
||||||
|
help="Do not load the dataset in streaming mode.",
|
||||||
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--dataset",
|
"--dataset",
|
||||||
type=str,
|
type=str,
|
||||||
@ -550,57 +623,70 @@ if __name__ == "__main__":
|
|||||||
help="Path to the ShareGPT dataset, will be deprecated in\
|
help="Path to the ShareGPT dataset, will be deprecated in\
|
||||||
the next release. The dataset is expected to "
|
the next release. The dataset is expected to "
|
||||||
"be a json in form of list[dict[..., conversations: "
|
"be a json in form of list[dict[..., conversations: "
|
||||||
"list[dict[..., value: <prompt_or_response>]]]]")
|
"list[dict[..., value: <prompt_or_response>]]]]",
|
||||||
parser.add_argument("--dataset-path",
|
)
|
||||||
type=str,
|
|
||||||
default=None,
|
|
||||||
help="Path to the dataset")
|
|
||||||
parser.add_argument("--input-len",
|
|
||||||
type=int,
|
|
||||||
default=None,
|
|
||||||
help="Input prompt length for each request")
|
|
||||||
parser.add_argument("--output-len",
|
|
||||||
type=int,
|
|
||||||
default=None,
|
|
||||||
help="Output length for each request. Overrides the "
|
|
||||||
"output length from the dataset.")
|
|
||||||
parser.add_argument("--n",
|
|
||||||
type=int,
|
|
||||||
default=1,
|
|
||||||
help="Number of generated sequences per prompt.")
|
|
||||||
parser.add_argument("--num-prompts",
|
|
||||||
type=int,
|
|
||||||
default=1000,
|
|
||||||
help="Number of prompts to process.")
|
|
||||||
parser.add_argument("--hf-max-batch-size",
|
|
||||||
type=int,
|
|
||||||
default=None,
|
|
||||||
help="Maximum batch size for HF backend.")
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
'--output-json',
|
"--dataset-path", type=str, default=None, help="Path to the dataset"
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--input-len",
|
||||||
|
type=int,
|
||||||
|
default=None,
|
||||||
|
help="Input prompt length for each request",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--output-len",
|
||||||
|
type=int,
|
||||||
|
default=None,
|
||||||
|
help="Output length for each request. Overrides the "
|
||||||
|
"output length from the dataset.",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--n", type=int, default=1, help="Number of generated sequences per prompt."
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--num-prompts", type=int, default=1000, help="Number of prompts to process."
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--hf-max-batch-size",
|
||||||
|
type=int,
|
||||||
|
default=None,
|
||||||
|
help="Maximum batch size for HF backend.",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--output-json",
|
||||||
type=str,
|
type=str,
|
||||||
default=None,
|
default=None,
|
||||||
help='Path to save the throughput results in JSON format.')
|
help="Path to save the throughput results in JSON format.",
|
||||||
parser.add_argument("--async-engine",
|
)
|
||||||
action='store_true',
|
parser.add_argument(
|
||||||
default=False,
|
"--async-engine",
|
||||||
help="Use vLLM async engine rather than LLM class.")
|
action="store_true",
|
||||||
parser.add_argument("--disable-frontend-multiprocessing",
|
default=False,
|
||||||
action='store_true',
|
help="Use vLLM async engine rather than LLM class.",
|
||||||
default=False,
|
)
|
||||||
help="Disable decoupled async engine frontend.")
|
parser.add_argument(
|
||||||
|
"--disable-frontend-multiprocessing",
|
||||||
|
action="store_true",
|
||||||
|
default=False,
|
||||||
|
help="Disable decoupled async engine frontend.",
|
||||||
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--disable-detokenize",
|
"--disable-detokenize",
|
||||||
action="store_true",
|
action="store_true",
|
||||||
help=("Do not detokenize the response (i.e. do not include "
|
help=(
|
||||||
"detokenization time in the measurement)"))
|
"Do not detokenize the response (i.e. do not include "
|
||||||
|
"detokenization time in the measurement)"
|
||||||
|
),
|
||||||
|
)
|
||||||
# LoRA
|
# LoRA
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--lora-path",
|
"--lora-path",
|
||||||
type=str,
|
type=str,
|
||||||
default=None,
|
default=None,
|
||||||
help="Path to the lora adapters to use. This can be an absolute path, "
|
help="Path to the LoRA adapters to use. This can be an absolute path, "
|
||||||
"a relative path, or a Hugging Face model identifier.")
|
"a relative path, or a Hugging Face model identifier.",
|
||||||
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--prefix-len",
|
"--prefix-len",
|
||||||
type=int,
|
type=int,
|
||||||
@ -614,7 +700,8 @@ if __name__ == "__main__":
|
|||||||
f"prefix_len (default: {SonnetDataset.DEFAULT_PREFIX_LEN}) "
|
f"prefix_len (default: {SonnetDataset.DEFAULT_PREFIX_LEN}) "
|
||||||
"controls how much of the input is fixed lines versus "
|
"controls how much of the input is fixed lines versus "
|
||||||
"random lines, but the total input length remains approximately "
|
"random lines, but the total input length remains approximately "
|
||||||
"input_len tokens.")
|
"input_len tokens.",
|
||||||
|
)
|
||||||
# random dataset
|
# random dataset
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--random-range-ratio",
|
"--random-range-ratio",
|
||||||
@ -628,16 +715,20 @@ if __name__ == "__main__":
|
|||||||
)
|
)
|
||||||
|
|
||||||
# hf dtaset
|
# hf dtaset
|
||||||
parser.add_argument("--hf-subset",
|
parser.add_argument(
|
||||||
type=str,
|
"--hf-subset", type=str, default=None, help="Subset of the HF dataset."
|
||||||
default=None,
|
)
|
||||||
help="Subset of the HF dataset.")
|
parser.add_argument(
|
||||||
parser.add_argument("--hf-split",
|
"--hf-split", type=str, default=None, help="Split of the HF dataset."
|
||||||
type=str,
|
)
|
||||||
default=None,
|
|
||||||
help="Split of the HF dataset.")
|
|
||||||
|
|
||||||
parser = AsyncEngineArgs.add_cli_args(parser)
|
parser = AsyncEngineArgs.add_cli_args(parser)
|
||||||
|
|
||||||
|
return parser
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
parser = create_argument_parser()
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
if args.tokenizer is None:
|
if args.tokenizer is None:
|
||||||
args.tokenizer = args.model
|
args.tokenizer = args.model
|
||||||
|
|||||||
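The validate_args() changes above keep the same policy as before: options that simply do not apply to the selected dataset are warned about and ignored, while genuinely incompatible combinations raise. A minimal standalone sketch of that warn-versus-raise pattern, written with plain argparse rather than vLLM's FlexibleArgumentParser (all names and flags below are illustrative only, not the benchmark's real CLI):

import argparse
import warnings


def _validate(args: argparse.Namespace) -> None:
    # Inapplicable option: warn and ignore.
    if args.dataset_name != "random" and args.random_range_ratio is not None:
        warnings.warn("--random-range-ratio is ignored for this dataset.", stacklevel=2)
    # Incompatible combination: fail fast.
    if args.enable_lora and args.backend != "vllm":
        raise ValueError("LoRA benchmarking is only supported for the vLLM backend")


parser = argparse.ArgumentParser()
parser.add_argument("--backend", default="vllm")
parser.add_argument("--dataset-name", default="sharegpt")
parser.add_argument("--random-range-ratio", type=float, default=None)
parser.add_argument("--enable-lora", action="store_true")

# Emits a warning (ratio ignored) but does not raise.
_validate(parser.parse_args(["--dataset-name", "sharegpt", "--random-range-ratio", "0.5"]))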
@@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import argparse
import json
@@ -7,9 +8,9 @@ import os
from typing import Any


def convert_to_pytorch_benchmark_format(
    args: argparse.Namespace, metrics: dict[str, list], extra_info: dict[str, Any]
) -> list:
    """
    Save the benchmark results in the format used by PyTorch OSS benchmark with
    on metric per record
@@ -37,12 +38,12 @@ def convert_to_pytorch_benchmark_format(args: argparse.Namespace,
            },
        }

        tp = record["benchmark"]["extra_info"]["args"].get("tensor_parallel_size")
        # Save tensor_parallel_size parameter if it's part of the metadata
        if not tp and "tensor_parallel_size" in extra_info:
            record["benchmark"]["extra_info"]["args"]["tensor_parallel_size"] = (
                extra_info["tensor_parallel_size"]
            )

        records.append(record)

@@ -50,7 +51,6 @@ def convert_to_pytorch_benchmark_format(args: argparse.Namespace,


class InfEncoder(json.JSONEncoder):
    def clear_inf(self, o: Any):
        if isinstance(o, dict):
            return {k: self.clear_inf(v) for k, v in o.items()}
@@ -66,4 +66,9 @@ class InfEncoder(json.JSONEncoder):

def write_to_json(filename: str, records: list) -> None:
    with open(filename, "w") as f:
        json.dump(
            records,
            f,
            cls=InfEncoder,
            default=lambda o: f"<{type(o).__name__} object is not JSON serializable>",
        )
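The new write_to_json() call adds a default= fallback so that values the encoder cannot serialize are replaced by a placeholder string instead of aborting the dump. A small self-contained sketch of that behaviour with the standard json module (it does not use vLLM's InfEncoder; the payload keys are made up):

import json


class _Handle:
    pass


payload = {"latency_s": 1.25, "handle": _Handle()}

# The non-serializable object falls back to a descriptive string.
print(
    json.dumps(
        payload,
        default=lambda o: f"<{type(o).__name__} object is not JSON serializable>",
    )
)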
@@ -1,13 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
from argparse import ArgumentParser

import pandas as pd

parser = ArgumentParser()
parser.add_argument("--input-path", type=str, required=True)
parser.add_argument("--output-path", type=str, required=True)

if __name__ == "__main__":
    args = parser.parse_args()
    df = pd.read_json(args.input_path, lines=True)
    df.to_csv(args.output_path)
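The helper deleted above only converted a JSON-lines file to CSV. For reference, the equivalent operation is a two-line pandas call; the paths below are placeholders standing in for the removed --input-path/--output-path flags:

import pandas as pd

# Placeholder file names for illustration.
df = pd.read_json("results.jsonl", lines=True)
df.to_csv("results.csv")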
@@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import argparse
import copy
@@ -23,8 +24,9 @@ DEFAULT_TP_SIZES = [1]


# bench
def bench_fn(
    label: str, sub_label: str, description: str, fn: Callable, *args, **kwargs
) -> TMeasurement:
    min_run_time = 1

    globals = {
@@ -41,16 +43,18 @@ def bench_fn(label: str, sub_label: str, description: str, fn: Callable, *args,
    ).blocked_autorange(min_run_time=min_run_time)


def bench_int8(
    dtype: torch.dtype, m: int, k: int, n: int, label: str, sub_label: str
) -> Iterable[TMeasurement]:
    assert dtype == torch.int8
    b_compressed, e, a, b = make_rand_sparse_tensors(torch.int8, m, n, k)
    scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
    scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)
    bias = torch.zeros((n,), device="cuda", dtype=torch.bfloat16)

    out = ops.cutlass_scaled_sparse_mm(
        a, b_compressed, e, scale_a, scale_b, torch.bfloat16
    )
    out_ref = ops.cutlass_scaled_mm(a, b, scale_a, scale_b, torch.bfloat16)

    if not torch.allclose(out, out_ref):
@@ -63,54 +67,107 @@ def bench_int8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
    timers = []
    # pytorch impl - bfloat16
    timers.append(
        bench_fn(
            label,
            sub_label,
            "pytorch_bf16_bf16_bf16_matmul-no-scales",
            torch.mm,
            a.to(dtype=torch.bfloat16),
            b.to(dtype=torch.bfloat16),
        )
    )

    # pytorch impl - float16
    timers.append(
        bench_fn(
            label,
            sub_label,
            "pytorch_fp16_fp16_fp16_matmul-no-scales",
            torch.mm,
            a.to(dtype=torch.float16),
            b.to(dtype=torch.float16),
        )
    )

    # cutlass impl
    timers.append(
        bench_fn(
            label,
            sub_label,
            "cutlass_i8_i8_bf16_scaled_mm",
            ops.cutlass_scaled_mm,
            a,
            b,
            scale_a,
            scale_b,
            torch.bfloat16,
        )
    )

    # cutlass with bias
    timers.append(
        bench_fn(
            label,
            sub_label,
            "cutlass_i8_i8_bf16_scaled_mm_bias",
            ops.cutlass_scaled_mm,
            a,
            b,
            scale_a,
            scale_b,
            torch.bfloat16,
            bias,
        )
    )

    # cutlass sparse impl
    timers.append(
        bench_fn(
            label,
            sub_label,
            "cutlass_i8_i8_bf16_scaled_sparse_mm",
            ops.cutlass_scaled_sparse_mm,
            a,
            b_compressed,
            e,
            scale_a,
            scale_b,
            torch.bfloat16,
        )
    )

    # cutlass sparse with bias
    timers.append(
        bench_fn(
            label,
            sub_label,
            "cutlass_i8_i8_bf16_scaled_sparse_mm_bias",
            ops.cutlass_scaled_sparse_mm,
            a,
            b_compressed,
            e,
            scale_a,
            scale_b,
            torch.bfloat16,
            bias,
        )
    )

    return timers


def bench_fp8(
    dtype: torch.dtype, m: int, k: int, n: int, label: str, sub_label: str
) -> Iterable[TMeasurement]:
    assert dtype == torch.float8_e4m3fn
    b_compressed, e, a, b = make_rand_sparse_tensors(torch.float8_e4m3fn, m, n, k)
    scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
    scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)
    bias = torch.zeros((n,), device="cuda", dtype=torch.bfloat16)

    out = ops.cutlass_scaled_sparse_mm(
        a, b_compressed, e, scale_a, scale_b, torch.bfloat16
    )
    out_ref = ops.cutlass_scaled_mm(a, b, scale_a, scale_b, torch.bfloat16)

    if not torch.allclose(out, out_ref):
@@ -124,97 +181,165 @@ def bench_fp8(dtype: torch.dtype, m: int, k: int, n: int, label: str,

    # pytorch impl w. bf16
    timers.append(
        bench_fn(
            label,
            sub_label,
            "pytorch_bf16_bf16_bf16_matmul-no-scales",
            torch.mm,
            a.to(dtype=torch.bfloat16, device="cuda"),
            b.to(dtype=torch.bfloat16, device="cuda"),
        )
    )

    # pytorch impl: bf16 output, without fp8 fast accum
    timers.append(
        bench_fn(
            label,
            sub_label,
            "pytorch_fp8_fp8_bf16_scaled_mm",
            torch._scaled_mm,
            a,
            b,
            scale_a=scale_a,
            scale_b=scale_b,
            out_dtype=torch.bfloat16,
        )
    )

    # pytorch impl: bf16 output, with fp8 fast accum
    timers.append(
        bench_fn(
            label,
            sub_label,
            "pytorch_fp8_fp8_bf16_scaled_mm_fast_accum",
            torch._scaled_mm,
            a,
            b,
            scale_a=scale_a,
            scale_b=scale_b,
            out_dtype=torch.bfloat16,
            use_fast_accum=True,
        )
    )

    # pytorch impl: fp16 output, without fp8 fast accum
    timers.append(
        bench_fn(
            label,
            sub_label,
            "pytorch_fp8_fp8_fp16_scaled_mm",
            torch._scaled_mm,
            a,
            b,
            scale_a=scale_a,
            scale_b=scale_b,
            out_dtype=torch.float16,
        )
    )

    # pytorch impl: fp16 output, with fp8 fast accum
    timers.append(
        bench_fn(
            label,
            sub_label,
            "pytorch_fp8_fp8_fp16_scaled_mm_fast_accum",
            torch._scaled_mm,
            a,
            b,
            scale_a=scale_a,
            scale_b=scale_b,
            out_dtype=torch.float16,
            use_fast_accum=True,
        )
    )

    # cutlass impl: bf16 output
    timers.append(
        bench_fn(
            label,
            sub_label,
            "cutlass_fp8_fp8_bf16_scaled_mm",
            ops.cutlass_scaled_mm,
            a,
            b,
            scale_a,
            scale_b,
            torch.bfloat16,
        )
    )

    # cutlass impl: bf16 output
    timers.append(
        bench_fn(
            label,
            sub_label,
            "cutlass_fp8_fp8_bf16_scaled_sparse_mm",
            ops.cutlass_scaled_sparse_mm,
            a,
            b_compressed,
            e,
            scale_a,
            scale_b,
            torch.bfloat16,
        )
    )

    # cutlass impl: fp16 output
    timers.append(
        bench_fn(
            label,
            sub_label,
            "cutlass_fp8_fp8_fp16_scaled_sparse_mm",
            ops.cutlass_scaled_sparse_mm,
            a,
            b_compressed,
            e,
            scale_a,
            scale_b,
            torch.float16,
        )
    )

    # cutlass impl: bf16 output, with bias
    timers.append(
        bench_fn(
            label,
            sub_label,
            "cutlass_fp8_fp8_bf16_scaled_sparse_mm_bias",
            ops.cutlass_scaled_sparse_mm,
            a,
            b_compressed,
            e,
            scale_a,
            scale_b,
            torch.bfloat16,
            bias,
        )
    )

    # cutlass impl: fp16 output, with bias
    timers.append(
        bench_fn(
            label,
            sub_label,
            "cutlass_fp8_fp8_fp16_scaled_sparse_mm_bias",
            ops.cutlass_scaled_sparse_mm,
            a,
            b_compressed,
            e,
            scale_a,
            scale_b,
            torch.float16,
            bias.to(dtype=torch.float16),
        )
    )

    return timers


def bench(
    dtype: torch.dtype, m: int, k: int, n: int, label: str, sub_label: str
) -> Iterable[TMeasurement]:
    if dtype == torch.int8:
        return bench_int8(dtype, m, k, n, label, sub_label)
    if dtype == torch.float8_e4m3fn:
@@ -228,12 +353,12 @@ def print_timers(timers: Iterable[TMeasurement]):
    compare.print()


def run(
    dtype: torch.dtype, MKNs: Iterable[tuple[int, int, int]]
) -> Iterable[TMeasurement]:
    results = []
    for m, k, n in MKNs:
        timers = bench(dtype, m, k, n, f"scaled-{dtype}-gemm", f"MKN=({m}x{k}x{n})")
        print_timers(timers)
        results.extend(timers)

@@ -241,10 +366,12 @@ def run(dtype: torch.dtype,


# output makers
def make_output(
    data: Iterable[TMeasurement],
    MKNs: Iterable[tuple[int, int, int]],
    base_description: str,
    timestamp=None,
):
    print(f"== All Results {base_description} ====")
    print_timers(data)

@@ -258,8 +385,7 @@ def make_output(data: Iterable[TMeasurement],


def run_square_bench(args):
    dim_sizes = list(range(args.dim_start, args.dim_end + 1, args.dim_increment))
    MKNs = list(zip(dim_sizes, dim_sizes, dim_sizes))
    data = run(args.dtype, MKNs)

@@ -319,7 +445,7 @@ def run_model_bench(args):
        pkl.dump(all_data, f)


if __name__ == "__main__":

    def to_torch_dtype(dt):
        if dt == "int8":
@@ -344,12 +470,15 @@ Benchmark Cutlass GEMM.
Output:
- a .pkl file, that is a list of raw torch.benchmark.utils.Measurements for the pytorch and cutlass implementations for the various GEMMs.
""",  # noqa: E501
        formatter_class=argparse.RawTextHelpFormatter,
    )

    parser.add_argument(
        "--dtype",
        type=to_torch_dtype,
        required=True,
        help="Available options are ['int8', 'fp8']",
    )
    subparsers = parser.add_subparsers(dest="cmd")

    square_parser = subparsers.add_parser("square_bench")
@@ -368,19 +497,19 @@ Benchmark Cutlass GEMM.
    range_parser.set_defaults(func=run_range_bench)

    model_parser = subparsers.add_parser("model_bench")
    model_parser.add_argument(
        "--models",
        nargs="+",
        type=str,
        default=DEFAULT_MODELS,
        choices=WEIGHT_SHAPES.keys(),
    )
    model_parser.add_argument(
        "--tp-sizes", nargs="+", type=int, default=DEFAULT_TP_SIZES
    )
    model_parser.add_argument(
        "--batch-sizes", nargs="+", type=int, default=DEFAULT_BATCH_SIZES
    )
    model_parser.set_defaults(func=run_model_bench)

    args = parser.parse_args()
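Both cutlass benchmark scripts funnel every kernel through bench_fn(), which wraps torch.utils.benchmark.Timer and blocked_autorange(). A standalone sketch of that timing pattern on CPU tensors; the labels, shapes, and the _bench_fn name below are illustrative, not the scripts' exact code:

import torch
import torch.utils.benchmark as TBenchmark


def _bench_fn(label, sub_label, description, fn, *args, **kwargs):
    # Time `fn(*args, **kwargs)` until at least min_run_time seconds of samples exist.
    return TBenchmark.Timer(
        stmt="fn(*args, **kwargs)",
        globals={"fn": fn, "args": args, "kwargs": kwargs},
        label=label,
        sub_label=sub_label,
        description=description,
    ).blocked_autorange(min_run_time=1)


a = torch.randn(256, 256)
b = torch.randn(256, 256)
measurement = _bench_fn("demo-gemm", "MKN=(256x256x256)", "torch.mm", torch.mm, a, b)
print(measurement)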
@@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

# Cutlass bench utils
from collections.abc import Iterable
@@ -10,8 +11,9 @@ import vllm._custom_ops as ops


def to_fp8(tensor: torch.Tensor) -> torch.Tensor:
    finfo = torch.finfo(torch.float8_e4m3fn)
    return torch.round(tensor.clamp(min=finfo.min, max=finfo.max)).to(
        dtype=torch.float8_e4m3fn
    )


def to_int8(tensor: torch.Tensor) -> torch.Tensor:
@@ -26,10 +28,11 @@ def to_fp16(tensor: torch.Tensor) -> torch.Tensor:
    return tensor.to(dtype=torch.float16)


def make_rand_tensors(
    dtype: torch.dtype, m: int, n: int, k: int
) -> tuple[torch.Tensor, torch.Tensor]:
    a = torch.randn((m, k), device="cuda") * 5
    b = torch.randn((n, k), device="cuda").t() * 5

    if dtype == torch.int8:
        return to_int8(a), to_int8(b)
@@ -49,9 +52,7 @@ def prune_to_2_4(tensor):

    # Create binary mask
    mask = torch.zeros_like(reshaped)
    mask.scatter_(dim=1, index=indices, src=torch.ones_like(indices, dtype=mask.dtype))

    # Apply mask and reshape back
    pruned = reshaped * mask
@@ -62,10 +63,11 @@ def prune_to_2_4(tensor):
    return pruned.reshape(original_shape)


def make_rand_sparse_tensors(
    dtype: torch.dtype, m: int, n: int, k: int
) -> tuple[torch.Tensor, torch.Tensor]:
    a = torch.randn((m, k), device="cuda") * 5
    b = torch.randn((n, k), device="cuda").t() * 5

    b = prune_to_2_4(b.t()).t()

@@ -86,9 +88,9 @@ def make_rand_sparse_tensors(dtype: torch.dtype, m: int, n: int,
    return b_compressed, e, a, b


def make_n_rand_sparse_tensors(
    num_tensors: int, dtype: torch.dtype, m: int, n: int, k: int
) -> tuple[Iterable[torch.Tensor], Iterable[torch.Tensor]]:
    ABs = []
    for _ in range(num_tensors):
        b_comp, e, a, b = make_rand_sparse_tensors(dtype, m, n, k)
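prune_to_2_4() enforces 2:4 structured sparsity: within every group of four consecutive values, only the two largest magnitudes survive and the rest are zeroed. A simplified CPU sketch of the same idea, assuming the element count is a multiple of four (the _prune_to_2_4 helper below is an illustration, not the utility from the diff):

import torch


def _prune_to_2_4(t: torch.Tensor) -> torch.Tensor:
    """Keep the 2 largest-magnitude values in every group of 4 (2:4 sparsity)."""
    original_shape = t.shape
    groups = t.reshape(-1, 4)
    # Indices of the top-2 magnitudes in each group of 4.
    _, idx = groups.abs().topk(2, dim=1)
    mask = torch.zeros_like(groups)
    mask.scatter_(dim=1, index=idx, src=torch.ones_like(idx, dtype=mask.dtype))
    return (groups * mask).reshape(original_shape)


x = torch.randn(4, 8)
print(_prune_to_2_4(x))  # exactly two non-zeros per group of four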
@@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import argparse
import copy
@@ -16,8 +17,9 @@ from weight_shapes import WEIGHT_SHAPES

from vllm import _custom_ops as ops
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
    w8a8_block_fp8_matmul,
)
from vllm.utils import FlexibleArgumentParser, cdiv

DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())
DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512]
@@ -25,8 +27,9 @@ DEFAULT_TP_SIZES = [1]


# bench
def bench_fn(
    label: str, sub_label: str, description: str, fn: Callable, *args, **kwargs
) -> TMeasurement:
    min_run_time = 1

    globals = {
@@ -44,45 +47,48 @@ def bench_fn(label: str, sub_label: str, description: str, fn: Callable, *args,


def bench_int8(
    dtype: torch.dtype,
    m: int,
    k: int,
    n: int,
    label: str,
    sub_label: str,
    bench_kernels: Optional[list[str]] = None,
) -> Iterable[TMeasurement]:
    """Benchmark INT8-based kernels."""
    assert dtype == torch.int8
    a, b = make_rand_tensors(torch.int8, m, n, k)
    scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
    scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)
    bias = torch.zeros((n,), device="cuda", dtype=torch.bfloat16)
    azp = torch.zeros((m,), device="cuda", dtype=torch.int32)
    azp_adj = torch.zeros((n,), device="cuda", dtype=torch.int32)

    bench_fns = {
        "pytorch_bf16_bf16_bf16_matmul-no-scales": lambda: torch.mm(
            a.to(dtype=torch.bfloat16), b.to(dtype=torch.bfloat16)
        ),
        "pytorch_fp16_fp16_fp16_matmul-no-scales": lambda: torch.mm(
            a.to(dtype=torch.float16), b.to(dtype=torch.float16)
        ),
        "cutlass_i8_i8_bf16_scaled_mm": lambda: ops.cutlass_scaled_mm(
            a, b, scale_a, scale_b, torch.bfloat16
        ),
        "cutlass_i8_i8_bf16_scaled_mm_bias": lambda: ops.cutlass_scaled_mm(
            a, b, scale_a, scale_b, torch.bfloat16, bias
        ),
        "cutlass_i8_i8_bf16_scaled_mm_azp": lambda: ops.cutlass_scaled_mm_azp(
            a, b, scale_a, scale_b, torch.bfloat16, azp_adj
        ),
        "cutlass_i8_i8_bf16_scaled_mm_azp_bias": lambda: ops.cutlass_scaled_mm_azp(
            a, b, scale_a, scale_b, torch.bfloat16, azp_adj, None, bias
        ),
        "cutlass_i8_i8_bf16_scaled_mm_azp_pt": lambda: ops.cutlass_scaled_mm_azp(
            a, b, scale_a, scale_b, torch.bfloat16, azp_adj, azp
        ),
        "cutlass_i8_i8_bf16_scaled_mm_azp_pt_bias": lambda: ops.cutlass_scaled_mm_azp(
            a, b, scale_a, scale_b, torch.bfloat16, azp_adj, azp, bias
        ),
    }

    timers = []
@@ -96,73 +102,68 @@ def bench_int8(


def bench_fp8(
    dtype: torch.dtype,
    m: int,
    k: int,
    n: int,
    label: str,
    sub_label: str,
    bench_kernels: Optional[list[str]] = None,
) -> Iterable[TMeasurement]:
    """Benchmark FP8-based kernels."""
    assert dtype == torch.float8_e4m3fn
    a, b = make_rand_tensors(torch.float8_e4m3fn, m, n, k)
    a_cont = a.contiguous()
    scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
    scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)

    block_scale_a = torch.rand((m, cdiv(k, 128)), device="cuda", dtype=torch.float32)
    block_scale_b = torch.rand(
        cdiv(k, 128), cdiv(n, 128), device="cuda", dtype=torch.float32
    )

    block_scale_a_M_major = block_scale_a.t().contiguous().t()
    block_scale_b_K_major = block_scale_b.t().contiguous().t()
    bias = torch.zeros((n,), device="cuda", dtype=torch.bfloat16)

    print(m, k, n)

    bench_fns = {
        "pytorch_bf16_bf16_bf16_matmul-no-scales": lambda: torch.mm(
            a.to(dtype=torch.bfloat16), b.to(dtype=torch.bfloat16)
        ),
        "pytorch_fp16_fp16_fp16_matmul-no-scales": lambda: torch.mm(
            a.to(dtype=torch.float16), b.to(dtype=torch.float16)
        ),
        "pytorch_fp8_fp8_fp16_scaled_mm": lambda: torch._scaled_mm(
            a, b, scale_a, scale_b, out_dtype=torch.float16
        ),
        "pytorch_fp8_fp8_fp16_scaled_mm_fast_accum": lambda: torch._scaled_mm(
            a, b, scale_a, scale_b, out_dtype=torch.float16, use_fast_accum=True
        ),
        "pytorch_fp8_fp8_bf16_scaled_mm": lambda: torch._scaled_mm(
            a, b, scale_a, scale_b, out_dtype=torch.bfloat16
        ),
        "pytorch_fp8_fp8_bf16_scaled_mm_fast_accum": lambda: torch._scaled_mm(
            a, b, scale_a, scale_b, out_dtype=torch.bfloat16, use_fast_accum=True
        ),
        "cutlass_fp8_fp8_bf16_scaled_mm": lambda: ops.cutlass_scaled_mm(
            a, b, scale_a, scale_b, torch.bfloat16
        ),
        "cutlass_fp8_fp8_fp16_scaled_mm": lambda: ops.cutlass_scaled_mm(
            a, b, scale_a, scale_b, torch.float16
        ),
        "cutlass_fp8_fp8_bf16_scaled_mm_bias": lambda: ops.cutlass_scaled_mm(
            a, b, scale_a, scale_b, torch.bfloat16, bias
        ),
        "cutlass_fp8_fp8_fp16_scaled_mm_bias": lambda: ops.cutlass_scaled_mm(
            a, b, scale_a, scale_b, torch.float16, bias.to(dtype=torch.float16)
        ),
        "triton_fp8_fp8_fp16_scaled_mm_blockwise": lambda: w8a8_block_fp8_matmul(
            a_cont, b.t(), block_scale_a, block_scale_b.t(), (128, 128)
        ),
        "cutlass_fp8_fp8_fp16_scaled_mm_blockwise": lambda: ops.cutlass_scaled_mm(
            a, b, block_scale_a_M_major, block_scale_b_K_major, torch.float16
        ),
    }

    timers = []
@@ -175,13 +176,15 @@ def bench_fp8(
    return timers


def bench(
    dtype: torch.dtype,
    m: int,
    k: int,
    n: int,
    label: str,
    sub_label: str,
    bench_kernels: Optional[list[str]] = None,
) -> Iterable[TMeasurement]:
    if dtype == torch.int8:
        return bench_int8(dtype, m, k, n, label, sub_label, bench_kernels)
    if dtype == torch.float8_e4m3fn:
@@ -195,27 +198,33 @@ def print_timers(timers: Iterable[TMeasurement]):
    compare.print()


def run(
    dtype: torch.dtype,
    MKNs: Iterable[tuple[int, int, int]],
    bench_kernels: Optional[list[str]] = None,
) -> Iterable[TMeasurement]:
    results = []
    for m, k, n in MKNs:
        timers = bench(
            dtype,
            m,
            k,
            n,
            f"scaled-{dtype}-gemm",
            f"MKN=({m}x{k}x{n})",
            bench_kernels=bench_kernels,
        )
        print_timers(timers)
        results.extend(timers)
    return results


def make_output(
    data: Iterable[TMeasurement],
    MKNs: Iterable[tuple[int, int, int]],
    base_description: str,
    timestamp=None,
):
    print(f"== All Results {base_description} ====")
    print_timers(data)

@@ -226,8 +235,7 @@ def make_output(data: Iterable[TMeasurement],


def run_square_bench(args):
    dim_sizes = list(range(args.dim_start, args.dim_end + 1, args.dim_increment))
    MKNs = list(zip(dim_sizes, dim_sizes, dim_sizes))
    data = run(args.dtype, MKNs, bench_kernels=args.kernels)
    make_output(data, MKNs, f"square_bench-{args.dtype}")
@@ -285,7 +293,7 @@ def run_model_bench(args):
        pkl.dump(all_data, f)


if __name__ == "__main__":

    def to_torch_dtype(dt):
        if dt == "int8":
@@ -310,19 +318,21 @@ Benchmark Cutlass GEMM.
Output:
- a .pkl file, that is a list of raw torch.benchmark.utils.Measurements for the pytorch and cutlass implementations for the various GEMMs.
""",  # noqa: E501
        formatter_class=argparse.RawTextHelpFormatter,
    )

    parser.add_argument(
        "--dtype",
        type=to_torch_dtype,
        required=True,
        help="Available options are ['int8', 'fp8']",
    )
    parser.add_argument(
        "--kernels",
        nargs="+",
        type=str,
        default=None,
        help="Exact names of the kernels to benchmark. If not set, runs all kernels.",
    )

    subparsers = parser.add_subparsers(dest="cmd")
@@ -343,19 +353,19 @@ Benchmark Cutlass GEMM.
    range_parser.set_defaults(func=run_range_bench)

    model_parser = subparsers.add_parser("model_bench")
    model_parser.add_argument(
        "--models",
        nargs="+",
        type=str,
        default=DEFAULT_MODELS,
        choices=WEIGHT_SHAPES.keys(),
    )
    model_parser.add_argument(
        "--tp-sizes", nargs="+", type=int, default=DEFAULT_TP_SIZES
    )
    model_parser.add_argument(
        "--batch-sizes", nargs="+", type=int, default=DEFAULT_BATCH_SIZES
    )
    model_parser.set_defaults(func=run_model_bench)

    args = parser.parse_args()
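The switch from k // 128 to cdiv(k, 128) sizes the block-scale tensors by ceiling division, so a K or N that is not an exact multiple of 128 still gets a scale entry for its final partial block. A tiny illustration of why that matters, using a local stand-in rather than the vllm.utils.cdiv imported in the diff:

# _cdiv is a local stand-in for ceiling division.
def _cdiv(a: int, b: int) -> int:
    return -(-a // b)


assert _cdiv(256, 128) == 2  # exact multiple: same as floor division
assert _cdiv(300, 128) == 3  # 300 // 128 == 2 would under-allocate the scale tensor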
@@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

# Weight Shapes are in the format
# ([K, N], TP_SPLIT_DIM)
@@ -42,4 +43,4 @@ WEIGHT_SHAPES = {
        ([8192, 57344], 1),
        ([28672, 8192], 0),
    ],
}
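Each WEIGHT_SHAPES entry is a ([K, N], TP_SPLIT_DIM) pair: the listed dimension is the one divided across tensor-parallel ranks. A hypothetical sketch of how such an entry might be sharded for a given TP size; the actual model_bench loop may differ:

# One entry taken from the table above; _shard is illustrative only.
WEIGHT_SHAPE = ([8192, 57344], 1)


def _shard(shape_and_dim, tp_size: int):
    (k, n), split_dim = shape_and_dim
    kn = [k, n]
    kn[split_dim] //= tp_size  # split the TP dimension across ranks
    return tuple(kn)


print(_shard(WEIGHT_SHAPE, 2))  # (8192, 28672)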