Mirror of https://github.com/vllm-project/vllm.git (synced 2025-10-20 23:03:52 +08:00)

Compare commits: v0.6.6 ... khluu/try_
1647 Commits
| SHA1 | Author | Date | |
|---|---|---|---|
| db9dfcfa6a | |||
| 9ef98d527e | |||
| 93491aefc7 | |||
| 7acd539cd7 | |||
| e75a6301bd | |||
| a79cc68b3a | |||
| 7e3f7a4ee7 | |||
| 9ec8257914 | |||
| 38327cf454 | |||
| dfa82e2a3d | |||
| e59ca942f5 | |||
| a57a3044aa | |||
| 4e5a0f6ae2 | |||
| b63bd14999 | |||
| 2041c0e360 | |||
| 085cbc4f9f | |||
| 2b93162fb0 | |||
| 2e45bd29fe | |||
| 51d7c6a2b2 | |||
| f3aca1ee30 | |||
| 8dd41d6bcc | |||
| 0a298ea418 | |||
| d330558bab | |||
| 656fd72976 | |||
| 79455cf421 | |||
| 30d6a015e0 | |||
| 8af5a5c4e5 | |||
| 3a5f0afcd2 | |||
| c7e63aa4d8 | |||
| 4a9ce1784c | |||
| 7e4e709b43 | |||
| 63d8eabed0 | |||
| e830b01383 | |||
| ff6473980d | |||
| a164aea35d | |||
| a76f547e11 | |||
| b7b7676d67 | |||
| e6e3c55ef2 | |||
| f98a4920f9 | |||
| d4bfc23ef0 | |||
| 9a2160fa55 | |||
| 2de4118243 | |||
| 239b7befdd | |||
| 09e974d483 | |||
| e5ef4fa99a | |||
| 037bcd942c | |||
| c2e7507ad4 | |||
| 3aa2b6a637 | |||
| 555aa21905 | |||
| e7ae3bf3d6 | |||
| b932c048ac | |||
| e85829450d | |||
| effc5d24fa | |||
| 18ed3132d2 | |||
| 9b459eca88 | |||
| 70fedd0f79 | |||
| bb103b29bf | |||
| 248e76c4df | |||
| 803d5c35f3 | |||
| 7fd8c0f85c | |||
| 44c3a5abc3 | |||
| 6909a76201 | |||
| 045533716b | |||
| 3c0ff914ac | |||
| 2bc4be4e32 | |||
| c67abd614f | |||
| 6fa7cd3dbc | |||
| 94744ba41a | |||
| 4965ec42d2 | |||
| 73aa7041bf | |||
| 7c1f760024 | |||
| da461f3cbf | |||
| 5b800f0932 | |||
| 8427f70493 | |||
| 7a7992085b | |||
| 1286211f57 | |||
| 6d531ad7b8 | |||
| 762b424a52 | |||
| de1cb38769 | |||
| c802f5430d | |||
| cff8991a50 | |||
| f3f8d8fff4 | |||
| 26df46ee59 | |||
| c3f687ac22 | |||
| 04437e313d | |||
| 038bededba | |||
| d03308be0c | |||
| c6bc0034d0 | |||
| 70e132244a | |||
| 47e9038d23 | |||
| 432cf22a6a | |||
| 2914006fe0 | |||
| 7329ff5468 | |||
| 541d1df486 | |||
| 3b00ff9138 | |||
| 91276c5721 | |||
| 0b4167526d | |||
| fd5fd26902 | |||
| 3bbaacbe15 | |||
| a10314c6b3 | |||
| 70f2c2a709 | |||
| 280d074103 | |||
| 32b14baf8a | |||
| 2d9045fce8 | |||
| 355f66348c | |||
| 8693e47e6a | |||
| cec8c7d7f8 | |||
| 4d0ec37267 | |||
| e7f720ea56 | |||
| 4ae17bf1e2 | |||
| 8a49eea74b | |||
| b4245a48df | |||
| 4e0f6076be | |||
| 726efc6a32 | |||
| bd45912b99 | |||
| 15dac210f0 | |||
| 112b3e5b3b | |||
| 32d669275b | |||
| 4098b72210 | |||
| 46450b8d33 | |||
| 13ac9cab21 | |||
| 66aa4c0bf4 | |||
| 247181536f | |||
| 07bf813fb5 | |||
| 8958217ad5 | |||
| ac5bc615b0 | |||
| 8063dfc61a | |||
| 6278bc829e | |||
| 3f532cb6a6 | |||
| e6c9053f9e | |||
| 43ed4143c4 | |||
| f4c98b4d4c | |||
| e1e0fd7543 | |||
| df8d3d1287 | |||
| 619d3de8bd | |||
| ecff8309a3 | |||
| dcf2a590f5 | |||
| 54aa619459 | |||
| fb22be5817 | |||
| 7f301dd8ef | |||
| 8095341a01 | |||
| 69db16a46a | |||
| ce78f9af4e | |||
| 9239bf718e | |||
| 7a6d45bc8a | |||
| e74ff409e0 | |||
| 7a888271f5 | |||
| 9d119a86ae | |||
| b2e85e26f4 | |||
| dd8a29da99 | |||
| 27df5199d9 | |||
| 35fad35a48 | |||
| 733e7c9e95 | |||
| 0af4d764d6 | |||
| e64afa455c | |||
| 1711b929b6 | |||
| c091c0a588 | |||
| 1aa162e030 | |||
| cf5c8f1686 | |||
| 4ec2cee000 | |||
| 99f536f830 | |||
| 5ebf66748b | |||
| 781d056280 | |||
| 5aefd6ac31 | |||
| 6c663dfd5e | |||
| 33437bc6e7 | |||
| 23114d3364 | |||
| 997c8811d6 | |||
| e42389f9d7 | |||
| ff38f0a32c | |||
| a5cfbab3c8 | |||
| ac3cd6e83c | |||
| 082ab86f5f | |||
| 6aa196c8dc | |||
| a0dd7dcd49 | |||
| e977c11111 | |||
| 5f063a80bd | |||
| 5d8e1c9279 | |||
| 0a049c7d86 | |||
| d0cfec7ab9 | |||
| a608160027 | |||
| 3f04a7fbf2 | |||
| 5994430b84 | |||
| a9e879b316 | |||
| 3e2f37a69a | |||
| 4f044b1d67 | |||
| 4157f563b4 | |||
| 051da7efe3 | |||
| 25f560a62c | |||
| a09ad90a72 | |||
| 10b34e36b9 | |||
| b5269db959 | |||
| 6db94571d7 | |||
| 97cfa65df7 | |||
| 911c8eb000 | |||
| ebcebeeb6b | |||
| f533b5837f | |||
| 8279201ce6 | |||
| 23fdab00a8 | |||
| 623e2ed29f | |||
| 9d72daf4ce | |||
| 6dd55af6c9 | |||
| 3eb08ed9b1 | |||
| 5eeadc2642 | |||
| 3aee6573dc | |||
| 9cc645141d | |||
| 0893567db9 | |||
| 8abe69b499 | |||
| 761702fd19 | |||
| 9606d572ed | |||
| cbcdf2c609 | |||
| 038de04d7b | |||
| 6b3cc75be0 | |||
| 7ffcccfa5c | |||
| cc8accfd53 | |||
| 948ab03e7e | |||
| 5797fb97e9 | |||
| 3892e58ad7 | |||
| d20e261199 | |||
| f622dbcf39 | |||
| dccf535f8e | |||
| 9c5c81b0da | |||
| d6cd59f122 | |||
| bc8ed3c4ba | |||
| b9bd76ca14 | |||
| 6ebaf9ac71 | |||
| f90d34b498 | |||
| f68cce8e64 | |||
| 09b6a95551 | |||
| 50c9636d87 | |||
| 0661cfef7a | |||
| a827aa815d | |||
| b877031d80 | |||
| dd861b992f | |||
| eb63ea1e18 | |||
| 2f4bd358f1 | |||
| 8a8b30eac1 | |||
| 2fa0e1396b | |||
| 1c2bec0f82 | |||
| ec870fba9a | |||
| df1430265c | |||
| 4c69e228b3 | |||
| 790b79750b | |||
| cfbb8c930f | |||
| baec0d4de9 | |||
| c21b99b912 | |||
| 93a00d7dde | |||
| 61e8c18350 | |||
| 8afcd0f633 | |||
| 91ca929dc7 | |||
| 84e00adc8a | |||
| 47c7126213 | |||
| a989ca2bf6 | |||
| 0fa3970deb | |||
| da6ea29f7a | |||
| 7297941b38 | |||
| f8a08cb90d | |||
| b15fd2be2a | |||
| e588ac237c | |||
| 5df2da5b97 | |||
| 11b986b3fb | |||
| 296f927f24 | |||
| 0032903a5b | |||
| 47195057e9 | |||
| 6edbfa924d | |||
| 1e508343e1 | |||
| 2e0b4cfde0 | |||
| 10f55fe6c5 | |||
| d3ccbd6350 | |||
| 0cfe7d386d | |||
| 0c6f5023c3 | |||
| 06dd08256f | |||
| 2b22290ce0 | |||
| d8e82bc06d | |||
| 086b56824c | |||
| 5a0905ba2a | |||
| a8f12a63fd | |||
| 69ae2380c6 | |||
| 27261e40a6 | |||
| e3f813c33b | |||
| c607a2652b | |||
| 3d45e3d749 | |||
| 742369d35a | |||
| bfe2fe0af4 | |||
| a8652f4f0f | |||
| 2f726b241e | |||
| a597a57595 | |||
| ae65f3e237 | |||
| 34868b106a | |||
| 1f16b7fe74 | |||
| b88be22165 | |||
| d8c6d7d6b5 | |||
| 40828ce5fe | |||
| ffa443afed | |||
| 70e500cad9 | |||
| 4cb1c05c9e | |||
| c47aafa37c | |||
| cfbca8a2f2 | |||
| 0fe5609874 | |||
| 22d33baca2 | |||
| b0e96aaebb | |||
| 8310e0b59b | |||
| 26dd972adb | |||
| 61c7a1b856 | |||
| 374ee287d8 | |||
| a4d83661d7 | |||
| 8363cd093d | |||
| 6c5a3195db | |||
| 073d1ed354 | |||
| 3d446433ec | |||
| 1fe0fd12d3 | |||
| dafb4e504a | |||
| 68cf1601d3 | |||
| 61f412187d | |||
| 05ccd0aa35 | |||
| f690372b68 | |||
| 8b3e94a357 | |||
| 437f9162d0 | |||
| 4f065f12f5 | |||
| 228b768db6 | |||
| 027827cc1d | |||
| 72a8639b68 | |||
| 99abb8b650 | |||
| 3a1e648158 | |||
| 46c759c165 | |||
| 179a619c21 | |||
| 452e8fd968 | |||
| 8b793f7ec6 | |||
| af35d3a3cc | |||
| 3b457143d2 | |||
| ab656f2c2f | |||
| 64fc2193dc | |||
| dd732028f5 | |||
| 414919138b | |||
| db7c8ca910 | |||
| f863ffc965 | |||
| 400d483e87 | |||
| d1695758b2 | |||
| 53a0cf8b95 | |||
| 5eeabc2a44 | |||
| 18551e820c | |||
| e41e160263 | |||
| b89fb2a4a1 | |||
| 5340b0e221 | |||
| 37e3806132 | |||
| c0efdd655b | |||
| aaaec52ad9 | |||
| e1eb45d397 | |||
| 89fca671fb | |||
| d20b0c139c | |||
| 166a168b0f | |||
| 2bb0e1a799 | |||
| 6eaf1e5c52 | |||
| 868a8c5b2c | |||
| b4ad56c1bd | |||
| 69698f257e | |||
| cd0cd85102 | |||
| 0a74bfce9c | |||
| dd3b865854 | |||
| 9b87a579aa | |||
| b539222d4e | |||
| 8d6cf89526 | |||
| 583a9778e0 | |||
| a73e183e36 | |||
| 1e799b7ec1 | |||
| 7f6c5ee06c | |||
| faa0275730 | |||
| 8a5a9b70d7 | |||
| bb3aeddfaf | |||
| aecc780dba | |||
| 90df7f23aa | |||
| b9b5bdfc7d | |||
| 31060b2757 | |||
| fc1f67715d | |||
| f6137adbcb | |||
| e53b1350f2 | |||
| d30aa7e9e6 | |||
| d1ad2a57af | |||
| b82662d952 | |||
| 71c1e07107 | |||
| b30c75dda4 | |||
| def232e122 | |||
| 3453b964a3 | |||
| 61c6a5a796 | |||
| 74bc397b0a | |||
| f58aea002c | |||
| 3556a41434 | |||
| 9ed6ee92d6 | |||
| ee3778d5fc | |||
| aaacf17324 | |||
| 4c7629cae9 | |||
| e0fdfa1608 | |||
| 5952d8ab61 | |||
| a2ae496589 | |||
| 877e352262 | |||
| d4d93db2c5 | |||
| 8c0d15d5c5 | |||
| 97ac781c62 | |||
| 776dcec8fe | |||
| ccf02fcbae | |||
| acaea3bb07 | |||
| 9f37422779 | |||
| dd344e0342 | |||
| 54a8804455 | |||
| bbd94a19fc | |||
| 233ffce1eb | |||
| 40677783aa | |||
| 14f301b541 | |||
| 46f98893dd | |||
| fe66b34728 | |||
| 270a5da495 | |||
| 7097b4cc1c | |||
| 977a16772c | |||
| 73deea2fdb | |||
| 9d2b4a70f4 | |||
| 0b0d6421b2 | |||
| 1140991a7b | |||
| 613c5bb945 | |||
| fd8e055ffb | |||
| ab93f1360f | |||
| 40253bab44 | |||
| c77620d22d | |||
| 989ecd2007 | |||
| 54cc46f3eb | |||
| 601bd3268e | |||
| 09269b3127 | |||
| 27b50f1fe6 | |||
| 9532c49836 | |||
| 0c2af17c76 | |||
| a6e0d096dd | |||
| d3d4956261 | |||
| 4059adc31b | |||
| f1f632d9ec | |||
| 95d680b862 | |||
| fb4c7f8ef0 | |||
| 0b1cfa6180 | |||
| 32ef4983cd | |||
| ad19c8a003 | |||
| 2a602b055a | |||
| 7888e1d0a3 | |||
| 60c872d4b6 | |||
| 3fb17d26c8 | |||
| d47807ba08 | |||
| 02fcaa3d0a | |||
| 8a4a2efc6f | |||
| 8e9ffd37d6 | |||
| 01b3fd0af7 | |||
| f53a0586b9 | |||
| b1cc4dfef5 | |||
| 382403921f | |||
| a73122de96 | |||
| bd44b812cb | |||
| 55211b01e8 | |||
| 5d043c1685 | |||
| 36d1ccb286 | |||
| 1bc3b739c4 | |||
| 1bd32bc8dd | |||
| 128bf75283 | |||
| a94a699c3f | |||
| ab426ec9c0 | |||
| 165290d357 | |||
| ce20124671 | |||
| 53be4a8634 | |||
| f5d3acd474 | |||
| 916836bbfb | |||
| d9f83d6206 | |||
| 4a754fcf15 | |||
| c0c25e25fa | |||
| 45f3f3f59e | |||
| ff47aab056 | |||
| debd6bbf09 | |||
| 5c538c37b2 | |||
| e22ee1e7a2 | |||
| e392d85831 | |||
| 77a318bd01 | |||
| 80e78d02ac | |||
| 4a42b9f5d6 | |||
| 47532cd9f4 | |||
| 36e0c8f7da | |||
| 9f583e360c | |||
| b706d898af | |||
| 863d315c86 | |||
| d374f04a33 | |||
| 61a01b27a7 | |||
| 53056731fd | |||
| 4cbf286794 | |||
| c6e14a61ab | |||
| 07b4b7a37f | |||
| 07964e2f30 | |||
| 4bf82d4b90 | |||
| 9ab326713f | |||
| af295e9b01 | |||
| a1c8f3796c | |||
| 08a1a1121d | |||
| 1477ffc381 | |||
| 70b808fe1a | |||
| 63d635d179 | |||
| 1fc973c0b5 | |||
| c982ac5722 | |||
| 4290b704ff | |||
| c91b64f749 | |||
| d6123170d5 | |||
| 485afdd3cb | |||
| 90e88ab756 | |||
| 04421dff8a | |||
| 432d6dad15 | |||
| 5ff0d32580 | |||
| 0967110e42 | |||
| fb0acb6c72 | |||
| 92b0ce2ac7 | |||
| bc2d4473bf | |||
| 3b352a2f92 | |||
| dea985aef0 | |||
| 39be30351f | |||
| 001a9c7b0d | |||
| 89cdaa83e7 | |||
| b0746fae3d | |||
| 60a98b2de5 | |||
| 460f553a6d | |||
| 1253b15774 | |||
| dc74613fa2 | |||
| a21076ed3a | |||
| 212007b168 | |||
| fb16eea48b | |||
| 73ae0b44e9 | |||
| 6d7f037748 | |||
| 10f7552789 | |||
| b0d541947a | |||
| 5f0b53c6ea | |||
| eb8b5eb183 | |||
| 9513290032 | |||
| 0d5e73d30e | |||
| 609ef61fea | |||
| db84f5eb3b | |||
| 206e2577fa | |||
| e02883c400 | |||
| 9085aabd62 | |||
| 8d5aa466fb | |||
| 0b7f06b447 | |||
| 03fe18ae0f | |||
| cb8bdfade2 | |||
| 33f227e16b | |||
| cfd0ae8234 | |||
| 7caff01a7b | |||
| be0b399d74 | |||
| b8b0ccbd2d | |||
| c908a07f57 | |||
| 7b6fd6e486 | |||
| 47512b3200 | |||
| 3b9c6c6947 | |||
| 4aae667668 | |||
| 9f3bc0f58c | |||
| 980385f8c1 | |||
| ca7a2d5f28 | |||
| 333681408f | |||
| ef64044079 | |||
| 66e16a038e | |||
| e1f0835ae0 | |||
| 8ed5421aaa | |||
| c6359e8ca6 | |||
| 952a074980 | |||
| d0feea31c7 | |||
| 58abe35455 | |||
| f7ebad2307 | |||
| 80e9afb5bc | |||
| 1e3598edeb | |||
| f7a6bd0fa1 | |||
| 0ca3b8e01c | |||
| cc10281498 | |||
| 05fb6718f0 | |||
| 12c29a881f | |||
| 70da0c0748 | |||
| c1588a2c94 | |||
| 8ca7a71df7 | |||
| 63137cd922 | |||
| ddd1ef66ec | |||
| e5e03c2c1b | |||
| e1744502c2 | |||
| dae6896977 | |||
| c34eeec58d | |||
| ad60bbb2b2 | |||
| 0578e5a462 | |||
| 04222984f8 | |||
| 6832707e90 | |||
| 6b2ef5cd17 | |||
| 958adce478 | |||
| 99b0915d3b | |||
| 8ca2b21c98 | |||
| d9292786e1 | |||
| cc2f9b32c8 | |||
| cd579352bf | |||
| 9f1710f1ac | |||
| e642ec962c | |||
| ada19210a3 | |||
| bf0560bda9 | |||
| 151b08e0fe | |||
| 81b2f4a45f | |||
| 82551ad616 | |||
| caac5c2e59 | |||
| 6bd1dd9d26 | |||
| 4f27044aab | |||
| 0ddc991f5c | |||
| fa82b93853 | |||
| 69ff99fdcd | |||
| 5d802522a7 | |||
| 1769928079 | |||
| ed6ea06577 | |||
| 5ee10e990d | |||
| 3dbd2d813a | |||
| f5f7f00cd9 | |||
| abcc61e0af | |||
| f6bb18fd9a | |||
| 71eaf8969b | |||
| ca100c90fe | |||
| ffad94397d | |||
| 4dacaa4a83 | |||
| a7ea35aa67 | |||
| 1e3e76b6cc | |||
| 53ea6ad830 | |||
| 1b7624bf5c | |||
| ac60dc7fe1 | |||
| a4f1ee35d6 | |||
| a32c8669ca | |||
| ca2ca8de57 | |||
| f71b00a19e | |||
| 8f808cf86e | |||
| 7bab4bb048 | |||
| e17e4488bd | |||
| 257e200a25 | |||
| 47d4a7e004 | |||
| 7f89a594dd | |||
| 961644e6a8 | |||
| 8d6cd32b7b | |||
| ec79b67c77 | |||
| 32985bed7c | |||
| dae9ec464c | |||
| 6eaf93020d | |||
| 72c62eae5f | |||
| 0a995d5434 | |||
| ade3f7d988 | |||
| 0df25101d6 | |||
| e123aafdf0 | |||
| 5b143d33be | |||
| eb59b5a6cb | |||
| fbfc3ee37e | |||
| 3e1d223626 | |||
| 4f5b059f14 | |||
| 288ca110f6 | |||
| c2bd2196fc | |||
| 550c7ba3dc | |||
| e5b2f1601a | |||
| 9badee53de | |||
| beebf4742a | |||
| f89978ad7c | |||
| b3cf368d79 | |||
| c8525f06fc | |||
| 5db6b2c961 | |||
| 6247bae6c6 | |||
| 3610fb4930 | |||
| 71c4b40562 | |||
| ac65bc92df | |||
| f78c0be80a | |||
| 66233af7b6 | |||
| bf13d40972 | |||
| 989f4f430c | |||
| bb5b640359 | |||
| c060b71408 | |||
| 79e4937c65 | |||
| cd1d3c3df8 | |||
| 19d98e0c7d | |||
| 2b04c209ee | |||
| ae122b1cbd | |||
| 872db2be0e | |||
| 2dfdfed8a0 | |||
| c41d27156b | |||
| 91373a0d15 | |||
| 848a6438ae | |||
| 98175b2816 | |||
| 4167252eaf | |||
| f35f8e2242 | |||
| b87c21fc89 | |||
| e584b85afd | |||
| 09e56f9262 | |||
| cf069aa8aa | |||
| bf33700ecd | |||
| bc6ccb9878 | |||
| 82fbeae92b | |||
| cc5e8f6db8 | |||
| d54990da47 | |||
| b9f1d4294e | |||
| b28246f6ff | |||
| 3b5567a209 | |||
| fdcc405346 | |||
| 8994dabc22 | |||
| 02296f420d | |||
| 6a92ff93e1 | |||
| 6a84164add | |||
| f64ffa8c25 | |||
| bd56c983d6 | |||
| 084bbac8cc | |||
| 28943d36ce | |||
| b526ca6726 | |||
| e7bd944e08 | |||
| c3b6559a10 | |||
| 4be4b26cb7 | |||
| 2aed2c9fa7 | |||
| 9b61dd41e7 | |||
| f7bee5c815 | |||
| e0734387fb | |||
| f58f8b5c96 | |||
| b3f7aaccd0 | |||
| b91660ddb8 | |||
| 76c89fcadd | |||
| b9e41734c5 | |||
| 1088f06242 | |||
| 73e0225ee9 | |||
| 6c85da3a18 | |||
| 67fc426845 | |||
| 9804145cac | |||
| 2e94b9cfbb | |||
| 8294773e48 | |||
| cd813c6d4d | |||
| 38acae6e97 | |||
| a2dd48c386 | |||
| 126f6beeb4 | |||
| 58d1b2aa77 | |||
| f1579b229d | |||
| 7864875879 | |||
| 1dd422b64a | |||
| 06c8f8d885 | |||
| 5677c9bb3e | |||
| 512d77d582 | |||
| 7f0be2aa24 | |||
| edf309ebbe | |||
| 788f284b53 | |||
| 4b1d141f49 | |||
| 10c3b8c1cf | |||
| a7f37314b7 | |||
| cd711c48b2 | |||
| 378b3ef6f8 | |||
| c9944acbf9 | |||
| ca377cf1b9 | |||
| a31614e386 | |||
| f95903909f | |||
| b382a7f28f | |||
| 4cb6fa0a9c | |||
| d08b285adf | |||
| b27122acc2 | |||
| 934bb99c71 | |||
| 3f808cc044 | |||
| ec8a5e5386 | |||
| 215bf150a6 | |||
| 0ecdd98031 | |||
| 7b700ec8c8 | |||
| 7ca1da020f | |||
| 5157338ed9 | |||
| e206b54331 | |||
| 1d35662e6d | |||
| e656f638de | |||
| 145944cb94 | |||
| 094b7d9496 | |||
| e1fe7591f2 | |||
| 5629f26df7 | |||
| 9ba28043b5 | |||
| 24679788ed | |||
| 07c4353057 | |||
| 34e3494e70 | |||
| f75aa72732 | |||
| 340e39e387 | |||
| f4133ce4e5 | |||
| 6522d55b6f | |||
| 6ff518626c | |||
| fa82074167 | |||
| 75e9d49796 | |||
| 32c3b6bfd1 | |||
| 37b6cb4985 | |||
| aabeb2688f | |||
| 2f42a4888c | |||
| 3173c3b34e | |||
| 2d87d7d1ac | |||
| aab392774b | |||
| 6724e79164 | |||
| 03f48b3db6 | |||
| 4d251ad00e | |||
| 18e505930d | |||
| 4a8cfc7551 | |||
| bc32bc73aa | |||
| ab1091d5f2 | |||
| 1e15aaef56 | |||
| 51010a1807 | |||
| 7196a3b1db | |||
| cdc1fa12eb | |||
| f61528d46d | |||
| 1f0ae3ed0a | |||
| db986c19ea | |||
| 227578480d | |||
| befc402d34 | |||
| 444b0f0f62 | |||
| ccc00515fd | |||
| 781096e385 | |||
| 7940d8a6a7 | |||
| c0e3ecd6d2 | |||
| 23eca9cf68 | |||
| 437b76ff59 | |||
| f90a375593 | |||
| e7ef74e26e | |||
| cbae7af552 | |||
| eb24dc4a45 | |||
| 9bebc9512f | |||
| 5a2ba16f5c | |||
| ba5106e519 | |||
| d5ca2110f1 | |||
| 2c5e637b57 | |||
| 322d2a27d6 | |||
| 82e0d601fc | |||
| 78ac0f591d | |||
| b56155e7f3 | |||
| 382f66fb08 | |||
| 8354f6640c | |||
| c904fdddf6 | |||
| 558db8083c | |||
| e109e598c7 | |||
| 8db1b9d0a1 | |||
| 2382ad29d1 | |||
| 3e472d882a | |||
| 7f6bae561c | |||
| 105b8ce4c0 | |||
| 2cb8c1540e | |||
| 1cd981da4f | |||
| fca20841c2 | |||
| da31b5333e | |||
| bb78fb318e | |||
| 8aca27fa11 | |||
| 95c617e04b | |||
| 9a1f1da5d1 | |||
| 68d630a0c7 | |||
| 68d535ef44 | |||
| c6ed93860f | |||
| 0ffdf8ce0c | |||
| 8c0dd3d4df | |||
| ada7c780d5 | |||
| 288cc6c234 | |||
| 900edbfa48 | |||
| b2c3fc5d65 | |||
| 839b27c6cc | |||
| 34ad27fe83 | |||
| 1c3c975766 | |||
| 1cdc88614a | |||
| 31aa045c11 | |||
| a30c093502 | |||
| c7b07a95a6 | |||
| 27a09dc52c | |||
| 981f3c831e | |||
| 44c33f01f3 | |||
| 33170081f1 | |||
| 71face8540 | |||
| bfbc0b32c6 | |||
| 6a417b8600 | |||
| d3ea50113c | |||
| 34aad515c8 | |||
| ed6e9075d3 | |||
| 992e5c3d34 | |||
| b69692a2d8 | |||
| a64a84433d | |||
| aa1e62d0db | |||
| 497bc83124 | |||
| 3738e6fa80 | |||
| 0023cd2b9d | |||
| 041e294716 | |||
| 9621667874 | |||
| 8c755c3b6d | |||
| ba81163997 | |||
| 0d243f2a54 | |||
| 88f6ba3281 | |||
| 512368e34a | |||
| 473f51cfd9 | |||
| a4c402a756 | |||
| 550d97eb58 | |||
| fbbe1fbac6 | |||
| 01c184b8f3 | |||
| ad5a35c21b | |||
| 5ae9f26a5a | |||
| 377d10bd14 | |||
| 52ce14d31f | |||
| 81dabf24a8 | |||
| 423330263b | |||
| caf7ff4456 | |||
| f525c0be8b | |||
| 983a40a8bb | |||
| fdc5df6f54 | |||
| 3b05cd4555 | |||
| d5d214ac7f | |||
| fd84857f64 | |||
| 8aada19dfc | |||
| 9aa95b0e6a | |||
| d0a7a2769d | |||
| 00b69c2d27 | |||
| 4c82229898 | |||
| c8d70e2437 | |||
| 30172b4947 | |||
| a4d577b379 | |||
| 7b203b7694 | |||
| 4fb8142a0e | |||
| a02c86b4dd | |||
| 3809458456 | |||
| d3231cb436 | |||
| 435b502a6e | |||
| 29fc5772c4 | |||
| 2358ca527b | |||
| 8cf97f8661 | |||
| e2603fefb8 | |||
| b53d79983c | |||
| 9915912f7f | |||
| d1b649f1ef | |||
| ac19b519ed | |||
| a1074b3efe | |||
| 00294e1bc6 | |||
| 88787bce1d | |||
| 932b51cedd | |||
| 7c7adf81fc | |||
| 67ef8f666a | |||
| efbe854448 | |||
| b3942e157e | |||
| cd4a72a28d | |||
| 6ac485a953 | |||
| 4c21ce9eba | |||
| ce77eb9410 | |||
| 30513d1cb6 | |||
| 1f69c4a892 | |||
| 7b623fca0b | |||
| 238dfc8ac3 | |||
| 45186834a0 | |||
| f857311d13 | |||
| 46cdd59577 | |||
| 2010f04c17 | |||
| 69e1d23e1e | |||
| d67cc21b78 | |||
| e18227b04a | |||
| 7b89386553 | |||
| da833b0aee | |||
| 5d2965b7d7 | |||
| a0231b7c25 | |||
| 124776ebd5 | |||
| b7d309860e | |||
| dc0f7ccf8b | |||
| d3d547e057 | |||
| 12913d17ba | |||
| 80f63a3966 | |||
| 367cb8ce8c | |||
| 54ed913f34 | |||
| 9206b3d7ec | |||
| ed0de3e4b8 | |||
| 2ad1bc7afe | |||
| 7fdaaf48ef | |||
| 067fa2255b | |||
| 9076325677 | |||
| 97a3d6d995 | |||
| 579d7a63b2 | |||
| c9f9d5b397 | |||
| 0c73026844 | |||
| 6a854c7a2b | |||
| e7eea5a520 | |||
| a12934d3ec | |||
| 3bcb8c75da | |||
| 5e5c8e091e | |||
| c9e2d644e7 | |||
| 7734e9a291 | |||
| 6224a9f620 | |||
| 085b7b2d6c | |||
| 4da1f667e9 | |||
| 556ef7f714 | |||
| 83481ceb49 | |||
| 185cc19f92 | |||
| 45f90bcbba | |||
| b0ccfc565a | |||
| ba59b78a9c | |||
| cbc40128eb | |||
| f0b2da72a8 | |||
| f2b20fe491 | |||
| 40932d7a05 | |||
| 84683fa271 | |||
| 067678262a | |||
| 09545c0a94 | |||
| dd5ede4440 | |||
| 8c32b08a86 | |||
| 410886950a | |||
| e38be640e6 | |||
| c1e37bf71b | |||
| 2344192a55 | |||
| bffddd9a05 | |||
| d84cef76eb | |||
| 37dfa60037 | |||
| 1bc3b5e71b | |||
| 02ed8a1fbe | |||
| 2092a6fa7d | |||
| c9d3ecf016 | |||
| fdcf64d3c6 | |||
| 578087e56c | |||
| fa253f1a70 | |||
| 9605c1256e | |||
| 0ccd8769fb | |||
| cb944d5818 | |||
| d46d490c27 | |||
| 04f50ad9d1 | |||
| 60c68df6d1 | |||
| 009439caeb | |||
| bc55d13070 | |||
| d88c8666a1 | |||
| 4fc5c23bb6 | |||
| 9f9704dca6 | |||
| 8eafe5eaea | |||
| 4c0d93f4b2 | |||
| 14b7899d10 | |||
| 09972e716c | |||
| 36a08630e8 | |||
| 2c2b560f48 | |||
| 042c3419fa | |||
| 82cabf53a3 | |||
| 314cfade02 | |||
| 985b4a2b19 | |||
| f4d97e4fc2 | |||
| f1042e86f0 | |||
| 7c4033acd4 | |||
| d59def4730 | |||
| 0c7d9effce | |||
| dd3b4a01f8 | |||
| a0597c6b75 | |||
| e92694b6fe | |||
| 842b0fd402 | |||
| 974dfd4971 | |||
| 3ee696a63d | |||
| 72c2b68dc9 | |||
| 14ecab5be2 | |||
| deb6c1c6b4 | |||
| 565c1efa65 | |||
| 2b25b7d2e1 | |||
| 6c4dbe23eb | |||
| 21f5d50fa5 | |||
| bf3e05215c | |||
| ad9776353e | |||
| 75e6e14516 | |||
| 110f59a33e | |||
| 2e3b969ec0 | |||
| da317197dd | |||
| 7539bbc6a6 | |||
| 9cf4759493 | |||
| 41c5dd45b9 | |||
| fc6485d277 | |||
| 78a141d768 | |||
| c320ca8edd | |||
| 58047c6f04 | |||
| cb080f32e3 | |||
| 2c0f58203c | |||
| 2ff4857678 | |||
| 91e876750e | |||
| 08b2d845d6 | |||
| 2ae889052c | |||
| 51f0b5f7f6 | |||
| fde71262e0 | |||
| 243137143c | |||
| b2496bb07f | |||
| 44607e07d3 | |||
| 67c4637ccf | |||
| aa0ca5ebb7 | |||
| 59fff4a01a | |||
| 29f1d47e73 | |||
| cf797aa856 | |||
| 24700c346b | |||
| d366ccc4e3 | |||
| 870c37481e | |||
| 86222a3dab | |||
| fe743b798d | |||
| 913df14da3 | |||
| 8a69e0e20e | |||
| 4c8dd12ef3 | |||
| 256a2d29dc | |||
| c45d398e6f | |||
| 011e612d92 | |||
| 7e1837676a | |||
| 2880e21e3d | |||
| 407b5537db | |||
| 4ea48fb35c | |||
| e31498bdcb | |||
| 91dd8f7aa6 | |||
| d01f66b039 | |||
| cc01223f3b | |||
| 306923da82 | |||
| 3243158336 | |||
| b21f0f9d17 | |||
| 45cbc4991d | |||
| 932c6b7461 | |||
| eaa92d4437 | |||
| 0630d4537a | |||
| 538fab93cd | |||
| ce26b16268 | |||
| 1918aa1b80 | |||
| 6e1fc61f0f | |||
| aa375dca9f | |||
| 433c4a4923 | |||
| ef533d25fb | |||
| b260782357 | |||
| 741429a4cd | |||
| aff404571b | |||
| 467a96a541 | |||
| 8108ac841d | |||
| afe74f7a96 | |||
| 09b95e36ab | |||
| 85ac82d228 | |||
| 1e57b1ee63 | |||
| e152f29502 | |||
| c786e757fa | |||
| cefd56ee35 | |||
| 7ca9934fe7 | |||
| 0408efc6d0 | |||
| 449d1bce02 | |||
| 1a6fcad4c9 | |||
| 56534cd577 | |||
| d88506dda4 | |||
| 9cdea30b4f | |||
| 76abd0c881 | |||
| 5b19b93082 | |||
| 75404d041b | |||
| bf3b79efb8 | |||
| 9a5b1554b4 | |||
| a4ce74c14a | |||
| 3b2005e1db | |||
| af8486de49 | |||
| 4c3aac51e1 | |||
| bc1bdecebf | |||
| 022bcc701a | |||
| c53dc466b1 | |||
| 3d09e592a8 | |||
| fcf2e3d7fc | |||
| 58b218d7ae | |||
| 7ff7a638b6 | |||
| 686006a220 | |||
| 98fd089fc9 | |||
| 249824c3bf | |||
| 64862d106e | |||
| b3a0d01e45 | |||
| 75e94309e8 | |||
| 233df6f5c4 | |||
| 18016a5e62 | |||
| 649550f27e | |||
| 62467a834a | |||
| 6469038b14 | |||
| 815079de8e | |||
| 18a88fcccc | |||
| d1ca7df84d | |||
| 96b23621c1 | |||
| c36ac98d01 | |||
| 4896d0c2dd | |||
| bb392af434 | |||
| 5d98d56089 | |||
| 73b35cca7f | |||
| 5095e96606 | |||
| cf58b9c4ca | |||
| 4797dad3ec | |||
| 6dd5e52823 | |||
| c11de33dad | |||
| 33e0602e59 | |||
| a1a2aaadb9 | |||
| 1298a400e8 | |||
| ad4a9dc817 | |||
| b9986454fe | |||
| c5932e5dac | |||
| 20579c0fae | |||
| 95460fc513 | |||
| 326fcc8b9f | |||
| e64330910b | |||
| e489ad7a21 | |||
| f256ebe4df | |||
| f8ece6e17f | |||
| abfcdcdf27 | |||
| e497f33491 | |||
| baaa2b24da | |||
| b4e5c03306 | |||
| 3194039c0e | |||
| 4f4d427ac2 | |||
| 1e3698393f | |||
| baeded2569 | |||
| 3e1c76cf3a | |||
| cfa134d247 | |||
| 35b7a05507 | |||
| 1867c258bd | |||
| cb3e73e4c8 | |||
| b1340f9d55 | |||
| 44bbca78d7 | |||
| 60808bd4c7 | |||
| fc542144c4 | |||
| eb5741ad42 | |||
| 145c2ff648 | |||
| 415f19474d | |||
| 89003c4082 | |||
| 60bcef000e | |||
| 847f883232 | |||
| 325f679f32 | |||
| e3f7ff65e7 | |||
| 7a8987dac5 | |||
| cabaf4eff3 | |||
| a1fc18c030 | |||
| 9798b2fb00 | |||
| 4078052f09 | |||
| bd2107e30a | |||
| 9b0c4bab36 | |||
| 41bf5612f5 | |||
| a2769032ca | |||
| f17f1d4608 | |||
| 1c1bb0bbf2 | |||
| e0cc5f259a | |||
| 73aa6cfdf7 | |||
| 27b78c73ca | |||
| b02fd288b2 | |||
| ff7424f491 | |||
| d93bf4da85 | |||
| 036ca94c25 | |||
| ef001d98ef | |||
| 5f671cb4c3 | |||
| bd02164cf9 | |||
| 46fb056749 | |||
| dd6a3a02cb | |||
| a7e3eba66f | |||
| fbb5bd4cef | |||
| 80fcc3ed1c | |||
| c386c43ca3 | |||
| f26d790718 | |||
| 0f657bdc52 | |||
| 3fd1fb63ef | |||
| 925d2f1908 | |||
| 8f58a51358 | |||
| 2079e43bee | |||
| e29d4358ef | |||
| 8cbc424975 | |||
| dd66fd2b01 | |||
| 0f465ab533 | |||
| 23a7cbc88b | |||
| 426a5c3625 | |||
| ddee88d0ff | |||
| 823ab79633 | |||
| 6116ca8cd7 | |||
| 2bc3fbba0c | |||
| 3f1fc7425a | |||
| 01ba927040 | |||
| 103bd17ac5 | |||
| ce69f7f754 | |||
| 624a1e4711 | |||
| 372bf0890b | |||
| 5204ff5c3f | |||
| 0cc6b383d7 | |||
| 28e0750847 | |||
| 582cf78798 | |||
| 0034b09ceb | |||
| 72bac73067 | |||
| 68f11149d8 | |||
| 72f4880425 | |||
| aa2cd2c43d | |||
| 9ddc35220b | |||
| a5255270c3 | |||
| 0ee349b553 | |||
| fa63e710c7 | |||
| 2a0309a646 | |||
| 324960a95c | |||
| f1fc0510df | |||
| bf21481dde | |||
| fb30ee92ee | |||
| 221d388cc5 | |||
| 3132a933b6 | |||
| df5dafaa5b | |||
| ab5bbf5ae3 | |||
| 3bb8e2c9a2 | |||
| e784c6b998 | |||
| 9a0f3bdbe5 | |||
| c7c9851036 | |||
| 3c818bdb42 | |||
| 6dd94dbe94 | |||
| 0e74d797ce | |||
| 55ef66edf4 | |||
| 5e5630a478 | |||
| d3d6bb13fb | |||
| 24b0205f58 | |||
| c5cffcd0cd | |||
| 682b55bc07 | |||
| 9726ad676d | |||
| eb5cb5e528 | |||
| 2cbeedad09 | |||
| 2c85529bfc | |||
| e97f802b2d | |||
| 6e650f56a1 | |||
| 3f50c148fd | |||
| 8c01b8022c | |||
| 99d01a5e3d | |||
| d07efb31c5 | |||
| 978b45f399 | |||
| c5b4b11d7f | |||
| 8ae5ff2009 | |||
| 511627445e | |||
| f0ef37233e | |||
| 7551a34032 | |||
| 01a55941f5 | |||
| 8d7aa9de71 | |||
| 68c4421b6d | |||
| aea94362c9 | |||
| 7206ce4ce1 | |||
| 96f6a7596f | |||
| 84bee4bd5c | |||
| fc66dee76d | |||
| 6609cdf019 | |||
| 16366ee8bb | |||
| 528dbcac7d | |||
| cd7b6f0857 | |||
| 68ad4e3a8d | |||
| 4004f144f3 | |||
| 66818e5b63 | |||
| 222a9dc350 | |||
| cbdc4ad5a5 | |||
| 016e3676e7 | |||
| 64ea24d0b3 | |||
| df76e5af26 | |||
| 09ccc9c8f7 | |||
| 69196a9bc7 | |||
| 2acba47d9b | |||
| 9c485d9e25 | |||
| fa9ee08121 | |||
| 347eeebe3b | |||
| 18fd4a8331 | |||
| 132a132100 | |||
| 1e60f87bb3 | |||
| 9705b90bcf | |||
| 3aec49e56f | |||
| c64612802b | |||
| 9a7c3a0042 | |||
| b197a5ccfd | |||
| c81081fece | |||
| a94eee4456 | |||
| f2e9f2a3be | |||
| 1f1542afa9 | |||
| 96912550c8 | |||
| 2fc6944c5e | |||
| 5fe6bf29d6 | |||
| d4b62d4641 | |||
| ecf67814f1 | |||
| 750f4cabfa | |||
| 06a760d6e8 | |||
| da7512215f | |||
| af69a6aded | |||
| 7bd3630067 | |||
| 96663699b2 | |||
| 18572e3384 | |||
| 86bfb6dba7 | |||
| 5f0ec3935a | |||
| c222f47992 | |||
| 170eb35079 | |||
| b37d82791e | |||
| 3127e975fb | |||
| 4001ea1266 | |||
| 5c89a29c22 | |||
| 59a0192fb9 | |||
| 83609791d2 | |||
| 0974c9bc5c | |||
| d2643128f7 | |||
| c5c06209ec | |||
| 3ea7b94523 | |||
| 51ef828f10 | |||
| df450aa567 | |||
| bbe5f9de7d | |||
| 81763c58a0 | |||
| edaae198e7 | |||
| 936db119ed | |||
| e66faf4809 | |||
| 630eb5b5ce | |||
| 4e94951bb1 | |||
| 7a8a48d51e | |||
| 32eb0da808 | |||
| 6d0e3d3724 | |||
| 02798ecabe | |||
| 813f249f02 | |||
| da02cb4b27 | |||
| c09503ddd6 | |||
| 2b83503227 | |||
| 7b98a65ae6 | |||
| b5b57e301e | |||
| 54cacf008f | |||
| 58fd57ff1d | |||
| 87a0c076af | |||
| d4e6194570 | |||
| 07934cc237 | |||
| 69d765f5a5 | |||
| 8027a72461 | |||
| d75ab55f10 | |||
| d1adb9b403 | |||
| b8bfa46a18 | |||
| 1475847a14 | |||
| fead53ba78 | |||
| ebc73f2828 | |||
| d06e824006 | |||
| 62b06ba23d | |||
| 5fd24ec02e | |||
| 874f7c292a | |||
| 92e793d91a | |||
| bf53e0c70b | |||
| dd7c9ad870 | |||
| 9aa1519f08 | |||
| f8ef146f03 | |||
| fa0050db08 | |||
| cd9d06fb8d | |||
| ebd8c669ef | |||
| 70755e819e | |||
| edce722eaa | |||
| 57e729e874 | |||
| de0526f668 | |||
| 5ecf3e0aaf | |||
| 97eb97b5a4 | |||
| 3adf0ffda8 | |||
| ad388d25a8 | |||
| cbe94391eb | |||
| 994fc655b7 | |||
| 3f9b7ab9f5 | |||
| ad34c0df0f | |||
| f218f9c24d | |||
| 0794e7446e | |||
| b7ee940a82 | |||
| 9ddac56311 | |||
| 1a51b9f872 | |||
| 42f5e7c52a | |||
| a3a3ee4e6f | |||
| 87054a57ab | |||
| c9d6ff530b | |||
| a2d2acb4c8 | |||
| 2e0e017610 | |||
| 1f18adb245 | |||
| bb354e6b2d | |||
| ff39141a49 | |||
| 8a1f938e6f | |||
| 078da31903 | |||
| 1a401252b5 | |||
| f35ec461fc | |||
| 289b5191d5 | |||
| c6db21313c | |||
| a7d59688fb | |||
| 458e63a2c6 | |||
| e8c23ff989 | |||
| cd8249903f | |||
| 0f8cafe2d1 | |||
| 5340a30d01 | |||
| 89ce62a316 | |||
| c3f05b09a0 | |||
| cf6bbcb493 | |||
| 80ea3af1a0 | |||
| 9dd02d85ca | |||
| f7b3ba82c3 | |||
| 619ae268c3 | |||
| d14e98d924 | |||
| 9597a095f2 | |||
| 263a870ee1 | |||
| 8bddb73512 | |||
| f967e51f38 | |||
| 43f3d9e699 | |||
| b25cfab9a0 | |||
| 4b657d3292 | |||
| d697dc01b4 | |||
| a991f7d508 | |||
| 7a3a83e3b8 | |||
| c32a7c7c0c | |||
| 2118d0565c | |||
| 899136b857 | |||
| c9f09a4fe8 | |||
| d45cbe70f5 | |||
| 8a579408f3 | |||
| 46fa98ccad | |||
| aa1e77a19c | |||
| 5959564f94 | |||
| f33e033e27 | |||
| 482cdc494e | |||
| 20410b2fda | |||
| 12664ddda5 | |||
| 241ad7b301 | |||
| d85c47d6ad | |||
| ef725feafc | |||
| d907be7dc7 | |||
| d53575a5f0 | |||
| 61af633256 | |||
| ac2f3f7fee | |||
| cf5f000d21 | |||
| 3de2b1eafb | |||
| b844b99ad3 | |||
| c3cf54dda4 | |||
| 36f5303578 | |||
| 9a228348d2 | |||
| bd82872211 | |||
| 405eb8e396 | |||
| 65097ca0af | |||
| 1d967acb45 | |||
| 0bd1ff4346 | |||
| 310aca88c9 | |||
| a732900efc | |||
| d848800e88 | |||
| 730e9592e9 | |||
| 1fe554bac3 | |||
| 615e4a5401 | |||
| 3db0cafdf1 | |||
| 526de822d5 | |||
| 56fe4c297c | |||
| 47de8821d3 | |||
| 5984499e47 | |||
| ca47e176af | |||
| 78f4590b60 | |||
| 2f7024987e | |||
| 6cd40a5bfe | |||
| aba8d6ee00 | |||
| 2a0596bc48 | |||
| f12141170a | |||
| cfd3219f58 | |||
| a1b2b8606e | |||
| ad9f1aa679 | |||
| 889e662eae | |||
| ef68eb28d8 | |||
| 259abd8953 | |||
| f645eb6954 | |||
| f4923cb8bc | |||
| b640b19cc0 | |||
| dc71af0a71 | |||
| 4d29e91be8 | |||
| 91445c7bc8 | |||
| 5950f555a1 | |||
| a4e2b26856 | |||
| 973f5dc581 | |||
| c994223d56 | |||
| 869579a702 | |||
| c0efe92d8b | |||
| d9fa1c05ad | |||
| 2de197bdd4 | |||
| 869e829b85 | |||
| 8f37be38eb | |||
| 8082ad7950 | |||
| 1e4ce295ae | |||
| ce1917fcf2 | |||
| e512f76a89 | |||
| 898cdf033e | |||
| 0f3f3c86ec | |||
| b278557935 | |||
| 8ceffbf315 | |||
| d93d2d74fd | |||
| d0169e1b0f | |||
| 08fb75c72e | |||
| 91b361ae89 | |||
| e20c92bb61 | |||
| 32c9eff2ff | |||
| 4ca5d40adc | |||
| 9279b9f83d | |||
| ee77fdb5de | |||
| 996357e480 | |||
| 2a622d704a | |||
| 9c749713f6 | |||
| 022c5c6944 | |||
| f8fcca100b | |||
| 06bfb51963 | |||
| 408e560015 | |||
| 402d378360 | |||
| 9e764e7b10 | |||
| 33fc1e2e86 | |||
| eba17173d3 | |||
| 635b897246 | |||
| 4068f4b5b5 | |||
| 47831430cc | |||
| 65c08928c2 | |||
| ba214dffbe | |||
| eed11ebee9 | |||
| 300acb8347 | |||
| d91457d529 | |||
| fbf2564554 | |||
| d1d49397e7 | |||
| 9c93636d84 | |||
| e5d7ed0c53 | |||
| ad0d567e1c | |||
| bf0d97d786 | |||
| a655eb3025 | |||
| 1543914c04 | |||
| 61fed92c7e | |||
| 80c751e7f6 | |||
| e1a5c2f0a1 | |||
| fd3a62a122 | |||
| 07064cb1d4 | |||
| 2f1e8e8f54 | |||
| 68d37809b9 | |||
| 5dba257506 | |||
| 187e32997c | |||
| b55ed6ef8a | |||
| 2f385183f3 | |||
| 84c35c374a | |||
| 8c38ee7007 | |||
| b6087a6bee | |||
| 23c1b10a4c | |||
| a115ac46b5 | |||
| 73001445fb | |||
| 6d70198b17 | |||
| f962f426bc | |||
| 11d8a091c6 | |||
| 365801fedd | |||
| 4db72e57f6 | |||
| 0c6f998554 | |||
| e7c7c5e822 | |||
| 8c3230d8c1 | |||
| 2c5718809b | |||
| 82c49d3260 | |||
| 74fa1d123c | |||
| a2a40bcd0d | |||
| ccb1aabcca | |||
| 36e7670045 | |||
| 5886aa496e | |||
| 8d9b6721e7 | |||
| b12e87f942 | |||
| 5dbf854553 | |||
| 970d6d0776 | |||
| 628ec6c17b | |||
| 3682e33f9f | |||
| 0aa38d16f5 | |||
| faef77c0d6 | |||
| dba4d9dec6 | |||
| 32b4c63f02 | |||
| 4fb8e329fd | |||
| 328841d002 | |||
| d427e5cfda | |||
| 42bb201fd6 | |||
| 59d6bb4c86 | |||
| b7dcc003dc | |||
| d34be24bb1 | |||
| b5cbe8eeb3 | |||
| df04dffade | |||
| a60731247f | |||
| ac79799403 | |||
| dde1fa18c9 | |||
| 0240402c46 | |||
| 55509c2114 | |||
| 101418096f | |||
| 5ce4627a7e | |||
| 7af553ea30 | |||
| 2c9b8ea2b0 | |||
| d003f3ea39 | |||
| 6c6f7fe8a8 | |||
| 2339d59f92 | |||
| 1b875a0ef3 | |||
| eb881ed006 | |||
| 46d4359450 | |||
| 81b979f2a8 | |||
| 371d04d39b | |||
| 0c0c2015c5 | |||
| 82d24f7aac | |||
@@ -1,9 +1,14 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import os
 import sys
 import zipfile

-# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 250 MB
-VLLM_MAX_SIZE_MB = int(os.environ.get('VLLM_MAX_SIZE_MB', 250))
+# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 400 MiB
+# Note that we have 400 MiB quota, please use it wisely.
+# See https://github.com/pypi/support/issues/3792 .
+# Please also sync the value with the one in Dockerfile.
+VLLM_MAX_SIZE_MB = int(os.environ.get('VLLM_MAX_SIZE_MB', 400))


 def print_top_10_largest_files(zip_file):
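The hunk above only shows the top of the wheel-size check script. As a rough, hedged sketch of the kind of check it performs (the function name, messages, and CLI handling below are illustrative assumptions, not the script's actual code):

```python
import os
import sys
import zipfile

# Default mirrors the new value in the hunk above; override via the env var.
VLLM_MAX_SIZE_MB = int(os.environ.get('VLLM_MAX_SIZE_MB', 400))


def check_wheel_size(wheel_path: str) -> int:
    """Return 0 if the wheel fits in the quota, 1 otherwise."""
    size_mb = os.path.getsize(wheel_path) / (1024 * 1024)
    if size_mb > VLLM_MAX_SIZE_MB:
        print(f"{wheel_path} is {size_mb:.1f} MB, over the {VLLM_MAX_SIZE_MB} MB limit.")
        with zipfile.ZipFile(wheel_path) as zf:
            # List the ten largest members to hint at what to trim.
            for info in sorted(zf.infolist(), key=lambda i: i.file_size, reverse=True)[:10]:
                print(f"{info.file_size / (1024 * 1024):8.2f} MB  {info.filename}")
        return 1
    print(f"{wheel_path} is {size_mb:.1f} MB, within the {VLLM_MAX_SIZE_MB} MB limit.")
    return 0


if __name__ == "__main__":
    sys.exit(check_wheel_size(sys.argv[1]))
```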
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import argparse
 import os

@@ -4,8 +4,8 @@ tasks:
 - name: "gsm8k"
   metrics:
   - name: "exact_match,strict-match"
-    value: 0.233
+    value: 0.231
   - name: "exact_match,flexible-extract"
-    value: 0.236
+    value: 0.22
 limit: 1000
 num_fewshot: 5
@@ -0,0 +1,11 @@
+# bash ./run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM -b "auto" -t 2
+model_name: "nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-chnl_wts_per_tok_dyn_act_fp8-BitM"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.6353
+  - name: "exact_match,flexible-extract"
+    value: 0.637
+limit: null
+num_fewshot: null
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """
 LM eval harness on model to compare vs HF baseline computed offline.
 Configs are found in configs/$MODEL.yaml
@@ -12,6 +13,7 @@ from pathlib import Path

 import lm_eval
 import numpy
+import pytest
 import yaml

 RTOL = 0.05
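The `RTOL` constant kept as context above is what ties the measured scores back to the reference values stored in the YAML configs (for example the 0.231 and 0.22 values earlier in this compare). A minimal sketch of that tolerance check, assuming a `numpy.isclose`-style comparison (the helper name is illustrative):

```python
import numpy

RTOL = 0.05  # relative tolerance, as in the hunk above


def metric_within_tolerance(ground_truth: float, measured: float) -> bool:
    # One comparison per metric listed in the YAML config.
    return bool(numpy.isclose(ground_truth, measured, rtol=RTOL))


print(metric_within_tolerance(0.231, 0.225))  # True: within 5% of the reference
print(metric_within_tolerance(0.231, 0.10))   # False: a regression beyond tolerance
```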
@@ -45,6 +47,10 @@ def test_lm_eval_correctness():
     eval_config = yaml.safe_load(
         Path(TEST_DATA_FILE).read_text(encoding="utf-8"))

+    if eval_config[
+            "model_name"] == "nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform":  #noqa: E501
+        pytest.skip("FBGEMM is currently failing on main.")
+
     # Launch eval requests.
     results = launch_lm_eval(eval_config)

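For context, `launch_lm_eval(eval_config)` in the test above hands the YAML config to the lm-eval harness. A simplified sketch of such a helper, assuming the harness's `lm_eval.simple_evaluate` entry point (the exact `model_args` string is an assumption, not taken from this diff):

```python
import lm_eval


def launch_lm_eval(eval_config):
    # Illustrative only: the real helper derives model_args (tensor-parallel
    # size, max length, etc.) from the config before calling the harness.
    model_args = f"pretrained={eval_config['model_name']},max_model_len=4096"
    return lm_eval.simple_evaluate(
        model="vllm",
        model_args=model_args,
        tasks=[task["name"] for task in eval_config["tasks"]],
        num_fewshot=eval_config["num_fewshot"],
        limit=eval_config["limit"],
        batch_size="auto",
    )
```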
@@ -1,15 +1,13 @@
 # vLLM benchmark suite

-
 ## Introduction

 This directory contains two sets of benchmark for vllm.

 - Performance benchmark: benchmark vllm's performance under various workload, for **developers** to gain clarity on whether their PR improves/degrades vllm's performance
 - Nightly benchmark: compare vllm's performance against alternatives (tgi, trt-llm and lmdeploy), for **the public** to know when to choose vllm.

-
 See [vLLM performance dashboard](https://perf.vllm.ai) for the latest performance benchmark results and [vLLM GitHub README](https://github.com/vllm-project/vllm/blob/main/README.md) for latest nightly benchmark results.

 ## Performance benchmark quick overview

@@ -19,17 +17,14 @@ See [vLLM performance dashboard](https://perf.vllm.ai) for the latest performan

 **For benchmarking developers**: please try your best to constraint the duration of benchmarking to about 1 hr so that it won't take forever to run.

-
 ## Nightly benchmark quick overview

 **Benchmarking Coverage**: Fix-qps serving on A100 (the support for FP8 benchmark on H100 is coming!) on Llama-3 8B, 70B and Mixtral 8x7B.

 **Benchmarking engines**: vllm, TGI, trt-llm and lmdeploy.

 **Benchmarking Duration**: about 3.5hrs.

-
-
 ## Trigger the benchmark

 Performance benchmark will be triggered when:
@@ -39,16 +34,11 @@ Performance benchmark will be triggered when:
 Nightly benchmark will be triggered when:
 - Every commit for those PRs with `perf-benchmarks` label and `nightly-benchmarks` label.

-
-
-
-
 ## Performance benchmark details

-
 See [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases.

-#### Latency test
+### Latency test

 Here is an example of one test inside `latency-tests.json`:

@@ -68,23 +58,25 @@ Here is an example of one test inside `latency-tests.json`:
 ```

 In this example:
+
 - The `test_name` attributes is a unique identifier for the test. In `latency-tests.json`, it must start with `latency_`.
 - The `parameters` attribute control the command line arguments to be used for `benchmark_latency.py`. Note that please use underline `_` instead of the dash `-` when specifying the command line arguments, and `run-performance-benchmarks.sh` will convert the underline to dash when feeding the arguments to `benchmark_latency.py`. For example, the corresponding command line arguments for `benchmark_latency.py` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15`

 Note that the performance numbers are highly sensitive to the value of the parameters. Please make sure the parameters are set correctly.

 WARNING: The benchmarking script will save json results by itself, so please do not configure `--output-json` parameter in the json file.

-#### Throughput test
+### Throughput test
+
 The tests are specified in `throughput-tests.json`. The syntax is similar to `latency-tests.json`, except for that the parameters will be fed forward to `benchmark_throughput.py`.

 The number of this test is also stable -- a slight change on the value of this number might vary the performance numbers by a lot.

-#### Serving test
+### Serving test

 We test the throughput by using `benchmark_serving.py` with request rate = inf to cover the online serving overhead. The corresponding parameters are in `serving-tests.json`, and here is an example:

-```
+```json
 [
     {
         "test_name": "serving_llama8B_tp1_sharegpt",
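To make the underscore-to-dash convention described above concrete, here is a hedged sketch of a `latency-tests.json` entry built from the flag values quoted in the text (the `test_name` and the exact file contents are assumptions):

```python
import json

# Hypothetical entry; only the parameter values are quoted from the README text.
latency_test = {
    "test_name": "latency_llama8B_tp1",
    "parameters": {
        "model": "meta-llama/Meta-Llama-3-8B",
        "tensor_parallel_size": 1,
        "load_format": "dummy",
        "num_iters_warmup": 5,
        "num_iters": 15,
    },
}

# run-performance-benchmarks.sh converts each underscore key to a dashed flag;
# the same conversion expressed in Python:
args = []
for key, value in latency_test["parameters"].items():
    args += [f"--{key.replace('_', '-')}", str(value)]
print("benchmark_latency.py " + " ".join(args))
print(json.dumps([latency_test], indent=4))
```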
@@ -109,6 +101,7 @@ We test the throughput by using `benchmark_serving.py` with request rate = inf t
 ```

 Inside this example:
+
 - The `test_name` attribute is also a unique identifier for the test. It must start with `serving_`.
 - The `server-parameters` includes the command line arguments for vLLM server.
 - The `client-parameters` includes the command line arguments for `benchmark_serving.py`.
@@ -118,36 +111,33 @@ The number of this test is less stable compared to the delay and latency benchma

 WARNING: The benchmarking script will save json results by itself, so please do not configure `--save-results` or other results-saving-related parameters in `serving-tests.json`.

-#### Visualizing the results
+### Visualizing the results

 The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](tests/descriptions.md) with real benchmarking results.
 You can find the result presented as a table inside the `buildkite/performance-benchmark` job page.
 If you do not see the table, please wait till the benchmark finish running.
 The json version of the table (together with the json version of the benchmark) will be also attached to the markdown file.
 The raw benchmarking results (in the format of json files) are in the `Artifacts` tab of the benchmarking.

-
-
 ## Nightly test details

 See [nightly-descriptions.md](nightly-descriptions.md) for the detailed description on test workload, models and docker containers of benchmarking other llm engines.

-#### Workflow
+### Workflow

 - The [nightly-pipeline.yaml](nightly-pipeline.yaml) specifies the docker containers for different LLM serving engines.
 - Inside each container, we run [run-nightly-suite.sh](run-nightly-suite.sh), which will probe the serving engine of the current container.
 - The `run-nightly-suite.sh` will redirect the request to `tests/run-[llm serving engine name]-nightly.sh`, which parses the workload described in [nightly-tests.json](tests/nightly-tests.json) and performs the benchmark.
 - At last, we run [scripts/plot-nightly-results.py](scripts/plot-nightly-results.py) to collect and plot the final benchmarking results, and update the results to buildkite.

-#### Nightly tests
+### Nightly tests

 In [nightly-tests.json](tests/nightly-tests.json), we include the command line arguments for benchmarking commands, together with the benchmarking test cases. The format is highly similar to performance benchmark.

-#### Docker containers
+### Docker containers

 The docker containers for benchmarking are specified in `nightly-pipeline.yaml`.

 WARNING: the docker versions are HARD-CODED and SHOULD BE ALIGNED WITH `nightly-descriptions.md`. The docker versions need to be hard-coded as there are several version-specific bug fixes inside `tests/run-[llm serving engine name]-nightly.sh`.

 WARNING: populating `trt-llm` to latest version is not easy, as it requires updating several protobuf files in [tensorrt-demo](https://github.com/neuralmagic/tensorrt-demo.git).
-
@@ -1,5 +1,6 @@
 steps:
   - label: "Wait for container to be ready"
+    key: wait-for-container-image
     agents:
       queue: A100
     plugins:
@@ -9,13 +10,18 @@ steps:
           - image: badouralix/curl-jq
             command:
             - sh .buildkite/nightly-benchmarks/scripts/wait-for-image.sh
-  - wait
+  - label: "Cleanup H100"
+    agents:
+      queue: H100
+    depends_on: ~
+    command: docker system prune -a --volumes --force

   - label: "A100"
     # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
     agents:
       queue: A100
+    depends_on: wait-for-container-image
+    if: build.branch == "main"
     plugins:
     - kubernetes:
         podSpec:
@@ -49,6 +55,8 @@
     # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
     agents:
       queue: H200
+    depends_on: wait-for-container-image
+    if: build.branch == "main"
     plugins:
     - docker#v5.12.0:
         image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
@@ -73,7 +81,8 @@
     # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
     agents:
       queue: H100
-    depends_on: block-h100
+    depends_on: wait-for-container-image
+    if: build.branch == "main"
     plugins:
     - docker#v5.12.0:
         image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
@@ -89,3 +98,87 @@
         environment:
         - VLLM_USAGE_SOURCE
         - HF_TOKEN
+
+  # Premerge benchmark
+  - label: "A100"
+    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
+    agents:
+      queue: A100
+    depends_on: wait-for-container-image
+    if: build.branch != "main"
+    plugins:
+    - kubernetes:
+        podSpec:
+          priorityClassName: perf-benchmark
+          containers:
+          - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
+            command:
+            - bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+            resources:
+              limits:
+                nvidia.com/gpu: 8
+            volumeMounts:
+            - name: devshm
+              mountPath: /dev/shm
+            env:
+            - name: VLLM_USAGE_SOURCE
+              value: ci-test
+            - name: HF_TOKEN
+              valueFrom:
+                secretKeyRef:
+                  name: hf-token-secret
+                  key: token
+          nodeSelector:
+            nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
+          volumes:
+          - name: devshm
+            emptyDir:
+              medium: Memory
+
+  - label: "H200"
+    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
+    agents:
+      queue: H200
+    depends_on: wait-for-container-image
+    if: build.branch != "main"
+    plugins:
+    - docker#v5.12.0:
+        image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
+        command:
+        - bash
+        - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+        mount-buildkite-agent: true
+        propagate-environment: true
+        ipc: host
+        gpus: 4,5,6,7
+        volumes:
+          - /data/benchmark-hf-cache:/root/.cache/huggingface
+        environment:
+          - VLLM_USAGE_SOURCE
+          - HF_TOKEN
+
+  #- block: "Run H100 Benchmark"
+    #key: block-h100
+    #depends_on: ~
+
+  - label: "H100"
+    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
+    agents:
+      queue: H100
+    depends_on: wait-for-container-image
+    if: build.branch != "main"
+    plugins:
+    - docker#v5.12.0:
+        image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
+        command:
+        - bash
+        - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+        mount-buildkite-agent: true
+        propagate-environment: true
+        ipc: host
+        gpus: all # see CUDA_VISIBLE_DEVICES for actual GPUs used
+        volumes:
+          - /data/benchmark-hf-cache:/root/.cache/huggingface
+        environment:
+          - VLLM_USAGE_SOURCE
+          - HF_TOKEN
@@ -9,20 +9,19 @@ This file contains the downloading link for benchmarking results.

 Please download the visualization scripts in the post

-
 ## Results reproduction

 - Find the docker we use in `benchmarking pipeline`
 - Deploy the docker, and inside the docker:
   - Download `nightly-benchmarks.zip`.
-  - In the same folder, run the following code
-```
+  - In the same folder, run the following code:
+```console
 export HF_TOKEN=<your HF token>
 apt update
 apt install -y git
 unzip nightly-benchmarks.zip
 VLLM_SOURCE_CODE_LOC=./ bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
 ```

 And the results will be inside `./benchmarks/results`.

@@ -2,6 +2,7 @@
 # Nightly benchmark

 This benchmark aims to:
+
 - Provide performance clarity: Provide clarity on which one (vllm, tensorrt-llm, lmdeploy and SGLang) leads in performance in what workload.
 - Be reproducible: one can run the exact same set of benchmarking commands inside the exact same docker by following reproducing instructions.

@@ -9,7 +10,6 @@ Latest results: [results link](https://blog.vllm.ai/2024/09/05/perf-update.html)

 Latest reproduction guilde: [github issue link](https://github.com/vllm-project/vllm/issues/8176)

-
 ## Setup

 - Docker images:
@@ -33,7 +33,7 @@ Latest reproduction guilde: [github issue link](https://github.com/vllm-project/
 - Queries are randomly sampled, and arrival patterns are determined via Poisson process, but all with fixed random seed.
 - Evaluation metrics: Throughput (higher the better), TTFT (time to the first token, lower the better), ITL (inter-token latency, lower the better).

-# Known issues
+## Known issues

 - TRT-LLM crashes with Llama 3.1 8B [issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105).
 - TGI does not support `ignore-eos` flag.
@@ -7,10 +7,8 @@
 - Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
 - Evaluation metrics: end-to-end latency (mean, median, p99).
 
 {latency_tests_markdown_table}
 
 ## Throughput tests
 
 - Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed).
@@ -19,10 +17,8 @@
 - Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
 - Evaluation metrics: throughput.
 
 {throughput_tests_markdown_table}
 
 ## Serving tests
 
 - Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed).
@@ -33,13 +29,11 @@
 - We also added a speculative decoding test for llama-3 70B, under QPS 2
 - Evaluation metrics: throughput, TTFT (time to the first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99).
 
 {serving_tests_markdown_table}
 
 ## json version of the benchmarking tables
 
 This section contains the data of the markdown tables above in JSON format.
 You can load the benchmarking tables into pandas dataframes as follows:
 
 ```python
@@ -54,9 +48,9 @@ serving_results = pd.DataFrame.from_dict(benchmarking_results["serving"])
 ```
 
 The json string for all benchmarking tables:
 
 ```json
 {benchmarking_results_in_json_string}
 ```
 
 You can also check the raw experiment data in the Artifact tab of the Buildkite page.
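For readers who want to work with the embedded JSON directly, here is a minimal sketch of the pandas loading step the template describes; the `results.json` filename and the `"latency"`/`"throughput"` keys are assumptions for illustration (only the `"serving"` key is visible in the hunk context above).

```python
# Minimal sketch: load the JSON benchmarking tables into pandas dataframes.
# Assumption: the JSON string from the report has been saved to results.json.
import json

import pandas as pd

with open("results.json") as f:
    benchmarking_results = json.load(f)

# "serving" is the key visible in the hunk context above; "latency" and
# "throughput" are assumed to follow the same pattern.
latency_results = pd.DataFrame.from_dict(benchmarking_results.get("latency", {}))
throughput_results = pd.DataFrame.from_dict(benchmarking_results.get("throughput", {}))
serving_results = pd.DataFrame.from_dict(benchmarking_results["serving"])

print(serving_results.head())
```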
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import json
 import os
 from pathlib import Path
@@ -82,8 +84,13 @@ if __name__ == "__main__":
             # this result is generated via `benchmark_serving.py`
 
             # attach the benchmarking command to raw_result
-            with open(test_file.with_suffix(".commands")) as f:
-                command = json.loads(f.read())
+            try:
+                with open(test_file.with_suffix(".commands")) as f:
+                    command = json.loads(f.read())
+            except OSError as e:
+                print(e)
+                continue
 
             raw_result.update(command)
 
             # update the test name of this result
@@ -97,8 +104,13 @@ if __name__ == "__main__":
             # this result is generated via `benchmark_latency.py`
 
             # attach the benchmarking command to raw_result
-            with open(test_file.with_suffix(".commands")) as f:
-                command = json.loads(f.read())
+            try:
+                with open(test_file.with_suffix(".commands")) as f:
+                    command = json.loads(f.read())
+            except OSError as e:
+                print(e)
+                continue
 
             raw_result.update(command)
 
             # update the test name of this result
@@ -119,8 +131,13 @@ if __name__ == "__main__":
             # this result is generated via `benchmark_throughput.py`
 
             # attach the benchmarking command to raw_result
-            with open(test_file.with_suffix(".commands")) as f:
-                command = json.loads(f.read())
+            try:
+                with open(test_file.with_suffix(".commands")) as f:
+                    command = json.loads(f.read())
+            except OSError as e:
+                print(e)
+                continue
 
             raw_result.update(command)
 
             # update the test name of this result
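The same try/except guard is repeated for the serving, latency and throughput branches above. As a sketch only (the helper name is hypothetical, not part of the repository), the guard could be factored out like this:

```python
# Hypothetical helper: read the ".commands" sidecar for a result file and
# return None when it is missing or unreadable, mirroring the guard added in
# the three branches above.
import json
from pathlib import Path
from typing import Optional


def load_commands_sidecar(test_file: Path) -> Optional[dict]:
    try:
        with open(test_file.with_suffix(".commands")) as f:
            return json.loads(f.read())
    except OSError as e:
        print(e)
        return None
```

Each branch would then `continue` when the helper returns `None`, exactly as the inline version does.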
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import argparse
 
 from transformers import AutoTokenizer
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import argparse
 import json
 from pathlib import Path
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 from lmdeploy.serve.openai.api_client import APIClient
 
 api_client = APIClient("http://localhost:8000")
@@ -43,7 +43,7 @@ main() {
 
 
-    # The figures should be genereated by a separate process outside the CI/CD pipeline
+    # The figures should be generated by a separate process outside the CI/CD pipeline
     # # generate figures
     # python3 -m pip install tabulate pandas matplotlib
@@ -301,6 +301,104 @@ run_serving_tests() {
     kill_gpu_processes
 }
 
+run_genai_perf_tests() {
+  # run genai-perf tests
+
+  # $1: a json file specifying genai-perf test cases
+  local genai_perf_test_file
+  genai_perf_test_file=$1
+
+  # Iterate over genai-perf tests
+  jq -c '.[]' "$genai_perf_test_file" | while read -r params; do
+    # get the test name, and append the GPU type back to it.
+    test_name=$(echo "$params" | jq -r '.test_name')
+
+    # if TEST_SELECTOR is set, only run the test cases that match the selector
+    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
+      echo "Skip test case $test_name."
+      continue
+    fi
+
+    # prepend the current serving engine to the test name
+    test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name}
+
+    # get common parameters
+    common_params=$(echo "$params" | jq -r '.common_parameters')
+    model=$(echo "$common_params" | jq -r '.model')
+    tp=$(echo "$common_params" | jq -r '.tp')
+    dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
+    dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
+    port=$(echo "$common_params" | jq -r '.port')
+    num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
+    reuse_server=$(echo "$common_params" | jq -r '.reuse_server')
+
+    # get client and server arguments
+    server_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_server_parameters")
+    qps_list=$(echo "$params" | jq -r '.qps_list')
+    qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
+    echo "Running over qps list $qps_list"
+
+    # check if there is enough GPU to run the test
+    if [[ $gpu_count -lt $tp ]]; then
+      echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name."
+      continue
+    fi
+
+    if [[ $reuse_server == "true" ]]; then
+      echo "Reuse previous server for test case $test_name"
+    else
+      kill_gpu_processes
+      bash "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/launch-server.sh" \
+        "$server_params" "$common_params"
+    fi
+
+    if wait_for_server; then
+      echo ""
+      echo "$CURRENT_LLM_SERVING_ENGINE server is up and running."
+    else
+      echo ""
+      echo "$CURRENT_LLM_SERVING_ENGINE failed to start within the timeout period."
+      break
+    fi
+
+    # iterate over different QPS
+    for qps in $qps_list; do
+      # remove the surrounding single quote from qps
+      if [[ "$qps" == *"inf"* ]]; then
+        echo "qps was $qps"
+        qps=$num_prompts
+        echo "now qps is $qps"
+      fi
+
+      new_test_name=$test_name"_qps_"$qps
+      backend=$CURRENT_LLM_SERVING_ENGINE
+
+      if [[ "$backend" == *"vllm"* ]]; then
+        backend="vllm"
+      fi
+      #TODO: add output dir.
+      client_command="genai-perf profile \
+        -m $model \
+        --service-kind openai \
+        --backend vllm \
+        --endpoint-type chat \
+        --streaming \
+        --url localhost:$port \
+        --request-rate $qps \
+        --num-prompts $num_prompts \
+      "
+
+      echo "Client command: $client_command"
+
+      eval "$client_command"
+
+      #TODO: process/record outputs
+    done
+  done
+
+  kill_gpu_processes
+}
+
 prepare_dataset() {
 
@@ -328,12 +426,17 @@ main() {
 
   pip install -U transformers
 
+  pip install -r requirements/dev.txt
+  which genai-perf
+
   # check storage
   df -h
 
   ensure_installed wget
   ensure_installed curl
   ensure_installed jq
+  # genai-perf dependency
+  ensure_installed libb64-0d
 
   prepare_dataset
 
@@ -345,6 +448,10 @@ main() {
   # run the test
   run_serving_tests "$BENCHMARK_ROOT/tests/nightly-tests.json"
 
+  # run genai-perf tests
+  run_genai_perf_tests "$BENCHMARK_ROOT/tests/genai-perf-tests.json"
+  mv artifacts/ $RESULTS_FOLDER/
+
   # upload benchmark results to buildkite
   python3 -m pip install tabulate pandas
   python3 "$BENCHMARK_ROOT/scripts/summary-nightly-results.py"
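For reference, here is a minimal Python sketch of how one entry of `genai-perf-tests.json` (shown later in this diff) expands into the `genai-perf` client command that the shell function above builds with `jq`; the file path in the sketch is an assumption for illustration.

```python
# Sketch: mirror the jq-based expansion in run_genai_perf_tests() in Python.
# Assumption: the test file lives at the path used by the nightly pipeline.
import json
import shlex

with open(".buildkite/nightly-benchmarks/tests/genai-perf-tests.json") as f:
    tests = json.load(f)

for test in tests:
    common = test["common_parameters"]
    for qps in test["qps_list"]:
        cmd = [
            "genai-perf", "profile",
            "-m", common["model"],
            "--service-kind", "openai",
            "--backend", "vllm",
            "--endpoint-type", "chat",
            "--streaming",
            "--url", f"localhost:{common['port']}",
            "--request-rate", str(qps),
            "--num-prompts", str(common["num_prompts"]),
        ]
        print(shlex.join(cmd))
```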
@@ -309,11 +309,14 @@ run_serving_tests() {
 
       new_test_name=$test_name"_qps_"$qps
 
+      # pass the tensor parallel size to the client so that it can be displayed
+      # on the benchmark dashboard
       client_command="python3 benchmark_serving.py \
         --save-result \
         --result-dir $RESULTS_FOLDER \
         --result-filename ${new_test_name}.json \
         --request-rate $qps \
+        --metadata "tensor_parallel_size=$tp" \
         $client_args"
 
       echo "Running test case $test_name with qps $qps"
@@ -345,6 +348,11 @@ main() {
   check_gpus
   check_hf_token
 
+  # Set to v1 to run v1 benchmark
+  if [[ "${ENGINE_VERSION:-v0}" == "v1" ]]; then
+    export VLLM_USE_V1=1
+  fi
+
   # dependencies
   (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
   (which jq) || (apt-get update && apt-get -y install jq)
@@ -353,7 +361,7 @@ main() {
   # get the current IP address, required by benchmark_serving.py
   export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
   # turn of the reporting of the status of each request, to clean up the terminal output
-  export VLLM_LOG_LEVEL="WARNING"
+  export VLLM_LOGGING_LEVEL="WARNING"
 
   # prepare for benchmarking
   cd benchmarks || exit 1
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
 import datetime
 import json
 import os
@@ -1,6 +1,10 @@
 #!/bin/sh
 TOKEN=$(curl -s -L "https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-postmerge-repo:pull" | jq -r .token)
-URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-postmerge-repo/manifests/$BUILDKITE_COMMIT"
+if [[ "$BUILDKITE_BRANCH" == "main" ]]; then
+  URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-postmerge-repo/manifests/$BUILDKITE_COMMIT"
+else
+  URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-test-repo/manifests/$BUILDKITE_COMMIT"
+fi
 
 TIMEOUT_SECONDS=10
 
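A minimal Python sketch of the same manifest check, assuming the `requests` package is available; the retry loop and the per-repo token scope are illustrative choices, not part of the script above.

```python
# Sketch: poll the public ECR manifest for the current commit, choosing the
# repo by branch as wait-for-image.sh does. The retry count is an assumption.
import os
import time

import requests

commit = os.environ["BUILDKITE_COMMIT"]
branch = os.environ.get("BUILDKITE_BRANCH", "")
repo = "vllm-ci-postmerge-repo" if branch == "main" else "vllm-ci-test-repo"

token = requests.get(
    "https://public.ecr.aws/token",
    params={"service": "public.ecr.aws",
            "scope": f"repository:q9t5s3a7/{repo}:pull"},
    timeout=10,
).json()["token"]

url = f"https://public.ecr.aws/v2/q9t5s3a7/{repo}/manifests/{commit}"
for _ in range(30):
    if requests.get(url, headers={"Authorization": f"Bearer {token}"},
                    timeout=10).ok:
        print("image is available")
        break
    time.sleep(10)
```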
.buildkite/nightly-benchmarks/tests/genai-perf-tests.json (new file, 23 lines)
@@ -0,0 +1,23 @@
+[
+    {
+        "test_name": "llama8B_tp1_genai_perf",
+        "qps_list": [4,8,16,32],
+        "common_parameters": {
+            "model": "meta-llama/Meta-Llama-3-8B-Instruct",
+            "tp": 1,
+            "port": 8000,
+            "num_prompts": 500,
+            "reuse_server": false
+        },
+        "vllm_server_parameters": {
+            "disable_log_stats": "",
+            "disable_log_requests": "",
+            "gpu_memory_utilization": 0.9,
+            "num_scheduler_steps": 10,
+            "max_num_seqs": 512,
+            "dtype": "bfloat16"
+        },
+        "genai_perf_input_parameters": {
+        }
+    }
+]
@@ -29,4 +29,4 @@
         "num-iters": 15
     }
 }
 ]
@@ -63,11 +63,12 @@
             "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
             "disable_log_requests": "",
             "tensor_parallel_size": 4,
             "swap_space": 16,
-            "speculative_model": "turboderp/Qwama-0.5B-Instruct",
-            "num_speculative_tokens": 4,
-            "speculative_draft_tensor_parallel_size": 1,
-            "use_v2_block_manager": ""
+            "speculative_config": {
+                "model": "turboderp/Qwama-0.5B-Instruct",
+                "num_speculative_tokens": 4,
+                "draft_tensor_parallel_size": 1
+            }
         },
         "client_parameters": {
             "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
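For anyone migrating other test definitions by hand, here is a minimal Python sketch of the same key mapping; the function name is hypothetical and not part of the repository.

```python
# Hypothetical helper: rewrite the flat speculative-decoding keys into the
# nested "speculative_config" object shown in the diff above.
def to_speculative_config(server_params: dict) -> dict:
    params = dict(server_params)
    spec = {}
    if "speculative_model" in params:
        spec["model"] = params.pop("speculative_model")
    if "num_speculative_tokens" in params:
        spec["num_speculative_tokens"] = params.pop("num_speculative_tokens")
    if "speculative_draft_tensor_parallel_size" in params:
        spec["draft_tensor_parallel_size"] = params.pop(
            "speculative_draft_tensor_parallel_size")
    params.pop("use_v2_block_manager", None)  # dropped in the new format
    if spec:
        params["speculative_config"] = spec
    return params
```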
@@ -32,4 +32,4 @@
         "backend": "vllm"
     }
 }
 ]
@@ -1,9 +1,20 @@
 steps:
+- label: "Build wheel - CUDA 12.4"
+  agents:
+    queue: cpu_queue_postmerge
+  commands:
+    - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+    - "mkdir artifacts"
+    - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
+    - "bash .buildkite/upload-wheels.sh"
+  env:
+    DOCKER_BUILDKIT: "1"
+
 - label: "Build wheel - CUDA 12.1"
   agents:
     queue: cpu_queue_postmerge
   commands:
-    - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain ."
+    - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
     - "mkdir artifacts"
     - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
     - "bash .buildkite/upload-wheels.sh"
@@ -20,7 +31,7 @@ steps:
   agents:
     queue: cpu_queue_postmerge
   commands:
-    - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain ."
+    - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=11.8.0 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
     - "mkdir artifacts"
     - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
     - "bash .buildkite/upload-wheels.sh"
@@ -37,7 +48,7 @@ steps:
     queue: cpu_queue_postmerge
   commands:
     - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-    - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain ."
+    - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ."
     - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
 
 - label: "Build and publish TPU release image"
@@ -46,7 +57,7 @@ steps:
   agents:
     queue: tpu_queue_postmerge
   commands:
-    - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f Dockerfile.tpu ."
+    - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f docker/Dockerfile.tpu ."
     - "docker push vllm/vllm-tpu:nightly"
     - "docker push vllm/vllm-tpu:$BUILDKITE_COMMIT"
   plugins:
@@ -56,6 +67,11 @@ steps:
   env:
     DOCKER_BUILDKIT: "1"
 
+- input: "Provide Release version here"
+  fields:
+    - text: "What is the release version?"
+      key: "release-version"
+
 - block: "Build CPU release image"
   key: block-cpu-release-image-build
   depends_on: ~
@@ -66,7 +82,7 @@ steps:
     queue: cpu_queue_postmerge
   commands:
     - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-    - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$RELEASE_VERSION --progress plain -f Dockerfile.cpu ."
-    - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$RELEASE_VERSION"
+    - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
+    - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
   env:
     DOCKER_BUILDKIT: "1"
@@ -77,7 +77,6 @@ echo "Commands:$commands"
 #ignore certain kernels tests
 if [[ $commands == *" kernels "* ]]; then
   commands="${commands} \
-  --ignore=kernels/test_attention.py \
   --ignore=kernels/test_attention_selector.py \
   --ignore=kernels/test_blocksparse_attention.py \
   --ignore=kernels/test_causal_conv1d.py \
@@ -92,19 +91,40 @@ if [[ $commands == *" kernels "* ]]; then
   --ignore=kernels/test_moe.py \
   --ignore=kernels/test_prefix_prefill.py \
   --ignore=kernels/test_rand.py \
-  --ignore=kernels/test_sampler.py"
+  --ignore=kernels/test_sampler.py \
+  --ignore=kernels/test_cascade_flash_attn.py \
+  --ignore=kernels/test_mamba_mixer2.py \
+  --ignore=kernels/test_aqlm.py \
+  --ignore=kernels/test_machete_mm.py \
+  --ignore=kernels/test_mha_attn.py \
+  --ignore=kernels/test_block_fp8.py \
+  --ignore=kernels/test_permute_cols.py"
 fi
 
-#ignore certain Entrypoints tests
+#ignore certain Entrypoints/openai tests
 if [[ $commands == *" entrypoints/openai "* ]]; then
   commands=${commands//" entrypoints/openai "/" entrypoints/openai \
-  --ignore=entrypoints/openai/test_accuracy.py \
   --ignore=entrypoints/openai/test_audio.py \
-  --ignore=entrypoints/openai/test_encoder_decoder.py \
-  --ignore=entrypoints/openai/test_embedding.py \
-  --ignore=entrypoints/openai/test_oot_registration.py "}
+  --ignore=entrypoints/openai/test_chat.py \
+  --ignore=entrypoints/openai/test_shutdown.py \
+  --ignore=entrypoints/openai/test_completion.py \
+  --ignore=entrypoints/openai/test_sleep.py \
+  --ignore=entrypoints/openai/test_models.py \
+  --ignore=entrypoints/openai/test_prompt_validation.py "}
 fi
 
+#ignore certain Entrypoints/llm tests
+if [[ $commands == *" && pytest -v -s entrypoints/llm/test_guided_generate.py"* ]]; then
+  commands=${commands//" && pytest -v -s entrypoints/llm/test_guided_generate.py"/" "}
+fi
+
+# --ignore=entrypoints/openai/test_encoder_decoder.py \
+# --ignore=entrypoints/openai/test_embedding.py \
+# --ignore=entrypoints/openai/test_oot_registration.py
+# --ignore=entrypoints/openai/test_accuracy.py \
+# --ignore=entrypoints/openai/test_models.py <= Fails on MI250 but passes on MI300 as of 2025-03-13
+
 PARALLEL_JOB_COUNT=8
 # check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs.
 if [[ $commands == *"--shard-id="* ]]; then
@@ -114,13 +134,16 @@ if [[ $commands == *"--shard-id="* ]]; then
     # assign shard-id for each shard
     commands_gpu=${commands//"--shard-id= "/"--shard-id=${GPU} "}
     echo "Shard ${GPU} commands:$commands_gpu"
+    echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
     docker run \
-      --device /dev/kfd --device /dev/dri \
-      --network host \
+      --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
+      --network=host \
       --shm-size=16gb \
       --rm \
       -e HIP_VISIBLE_DEVICES="${GPU}" \
       -e HF_TOKEN \
+      -e AWS_ACCESS_KEY_ID \
+      -e AWS_SECRET_ACCESS_KEY \
      -v "${HF_CACHE}:${HF_MOUNT}" \
      -e "HF_HOME=${HF_MOUNT}" \
      --name "${container_name}_${GPU}" \
@@ -141,13 +164,16 @@ if [[ $commands == *"--shard-id="* ]]; then
     fi
   done
 else
+  echo "Render devices: $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES"
   docker run \
-    --device /dev/kfd --device /dev/dri \
-    --network host \
+    --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
+    --network=host \
     --shm-size=16gb \
     --rm \
     -e HIP_VISIBLE_DEVICES=0 \
     -e HF_TOKEN \
+    -e AWS_ACCESS_KEY_ID \
+    -e AWS_SECRET_ACCESS_KEY \
    -v "${HF_CACHE}:${HF_MOUNT}" \
    -e "HF_HOME=${HF_MOUNT}" \
    --name "${container_name}" \
@@ -10,5 +10,5 @@ trap remove_docker_container EXIT
 remove_docker_container
 
 # Try building the docker image
-docker build -t cpu-test -f Dockerfile.ppc64le .
+docker build -t cpu-test -f docker/Dockerfile.ppc64le .
 
@@ -8,37 +8,40 @@ set -ex
 CORE_RANGE=${CORE_RANGE:-48-95}
 NUMA_NODE=${NUMA_NODE:-1}
 
-# Try building the docker image
-numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build -t cpu-test -f Dockerfile.cpu .
-numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" -t cpu-test-avx2 -f Dockerfile.cpu .
-
 # Setup cleanup
-remove_docker_container() { docker rm -f cpu-test-"$NUMA_NODE" cpu-test-avx2-"$NUMA_NODE" || true; }
+remove_docker_container() {
+  set -e;
+  docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true;
+  docker image rm cpu-test-"$BUILDKITE_BUILD_NUMBER" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 || true;
+}
 trap remove_docker_container EXIT
 remove_docker_container
 
+# Try building the docker image
+numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$BUILDKITE_BUILD_NUMBER" --target vllm-test -f docker/Dockerfile.cpu .
+numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
+
 # Run the image, setting --shm-size=4g for tensor parallel.
 docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
-  --cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test
+  --cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"
 docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
-  --cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-avx2-"$NUMA_NODE" cpu-test-avx2
+  --cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2
 
 function cpu_tests() {
   set -e
   export NUMA_NODE=$2
+  export BUILDKITE_BUILD_NUMBER=$3
 
   # offline inference
-  docker exec cpu-test-avx2-"$NUMA_NODE" bash -c "
+  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" bash -c "
     set -e
-    python3 examples/offline_inference.py"
+    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
 
   # Run basic model test
-  docker exec cpu-test-"$NUMA_NODE" bash -c "
+  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
     set -e
-    pip install pytest pytest-asyncio \
-      decord einops librosa peft Pillow sentence-transformers soundfile \
-      transformers_stream_generator matplotlib datamodel_code_generator
-    pip install torchvision --index-url https://download.pytorch.org/whl/cpu
+    pytest -v -s tests/kernels/test_cache.py -m cpu_model
+    pytest -v -s tests/kernels/test_mla_decode_cpu.py -m cpu_model
     pytest -v -s tests/models/decoder_only/language -m cpu_model
     pytest -v -s tests/models/embedding/language -m cpu_model
     pytest -v -s tests/models/encoder_decoder/language -m cpu_model
@@ -46,26 +49,26 @@ function cpu_tests() {
     pytest -v -s tests/models/decoder_only/vision_language -m cpu_model"
 
   # Run compressed-tensor test
-  docker exec cpu-test-"$NUMA_NODE" bash -c "
+  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
     set -e
     pytest -s -v \
       tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
      tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token"
 
   # Run AWQ test
-  docker exec cpu-test-"$NUMA_NODE" bash -c "
+  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
     set -e
     pytest -s -v \
       tests/quantization/test_ipex_quant.py"
 
   # Run chunked-prefill and prefix-cache test
-  docker exec cpu-test-"$NUMA_NODE" bash -c "
+  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
    set -e
    pytest -s -v -k cpu_model \
      tests/basic_correctness/test_chunked_prefill.py"
 
-  # online inference
-  docker exec cpu-test-"$NUMA_NODE" bash -c "
+  # online serving
+  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
     set -e
     export VLLM_CPU_KVCACHE_SPACE=10
     export VLLM_CPU_OMP_THREADS_BIND=$1
@@ -78,8 +81,14 @@ function cpu_tests() {
       --num-prompts 20 \
       --endpoint /v1/completions \
       --tokenizer facebook/opt-125m"
+
+  # Run multi-lora tests
+  docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
+    set -e
+    pytest -s -v \
+      tests/lora/test_qwen2vl.py"
 }
 
-# All of CPU tests are expected to be finished less than 25 mins.
+# All of CPU tests are expected to be finished less than 40 mins.
 export -f cpu_tests
-timeout 30m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
+timeout 40m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE $BUILDKITE_BUILD_NUMBER"
@@ -9,11 +9,13 @@ python3 use_existing_torch.py
 
 # Try building the docker image
 DOCKER_BUILDKIT=1 docker build . \
+  --file docker/Dockerfile \
   --target vllm-openai \
   --platform "linux/arm64" \
   -t gh200-test \
   --build-arg max_jobs=66 \
   --build-arg nvcc_threads=2 \
+  --build-arg RUN_WHEEL_CHECK=false \
   --build-arg torch_cuda_arch_list="9.0+PTX" \
   --build-arg vllm_fa_cmake_gpu_arches="90-real"
 
@@ -23,6 +25,6 @@ trap remove_docker_container EXIT
 remove_docker_container
 
 # Run the image and test offline inference
-docker run --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
-  python3 examples/offline_inference.py
+docker run -e HF_TOKEN -e VLLM_WORKER_MULTIPROC_METHOD=spawn -v /root/.cache/huggingface:/root/.cache/huggingface --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
+  python3 examples/offline_inference/basic/generate.py --model meta-llama/Llama-3.2-1B
 '
@@ -5,12 +5,20 @@
 set -ex
 
 # Try building the docker image
-docker build -t hpu-test-env -f Dockerfile.hpu .
+docker build -t hpu-test-env -f docker/Dockerfile.hpu .
 
 # Setup cleanup
+# certain versions of HPU software stack have a bug that can
+# override the exit code of the script, so we need to use
+# separate remove_docker_container and remove_docker_container_and_exit
+# functions, while other platforms only need one remove_docker_container
+# function.
+EXITCODE=1
 remove_docker_container() { docker rm -f hpu-test || true; }
-trap remove_docker_container EXIT
+remove_docker_container_and_exit() { remove_docker_container; exit $EXITCODE; }
+trap remove_docker_container_and_exit EXIT
 remove_docker_container
 
 # Run the image and launch offline inference
-docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference.py
+docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m
+EXITCODE=$?
@@ -3,6 +3,18 @@
 # This script build the Neuron docker image and run the API server inside the container.
 # It serves a sanity check for compilation and basic model usage.
 set -e
+set -v
+
+image_name="neuron/vllm-ci"
+container_name="neuron_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
+
+HF_CACHE="$(realpath ~)/huggingface"
+mkdir -p "${HF_CACHE}"
+HF_MOUNT="/root/.cache/huggingface"
+
+NEURON_COMPILE_CACHE_URL="$(realpath ~)/neuron_compile_cache"
+mkdir -p "${NEURON_COMPILE_CACHE_URL}"
+NEURON_COMPILE_CACHE_MOUNT="/root/.cache/neuron_compile_cache"
 
 # Try building the docker image
 aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-west-2.amazonaws.com
@@ -13,41 +25,30 @@ if [ -f /tmp/neuron-docker-build-timestamp ]; then
     last_build=$(cat /tmp/neuron-docker-build-timestamp)
     current_time=$(date +%s)
     if [ $((current_time - last_build)) -gt 86400 ]; then
-        docker system prune -f
+        # Remove dangling images (those that are not tagged and not used by any container)
+        docker image prune -f
+        # Remove unused volumes / force the system prune for old images as well.
+        docker volume prune -f && docker system prune -f
         echo "$current_time" > /tmp/neuron-docker-build-timestamp
     fi
 else
     date "+%s" > /tmp/neuron-docker-build-timestamp
 fi
 
-docker build -t neuron -f Dockerfile.neuron .
+docker build -t "${image_name}" -f docker/Dockerfile.neuron .
 
 # Setup cleanup
-remove_docker_container() { docker rm -f neuron || true; }
+remove_docker_container() {
+    docker image rm -f "${image_name}" || true;
+}
 trap remove_docker_container EXIT
-remove_docker_container
 
 # Run the image
-docker run --device=/dev/neuron0 --device=/dev/neuron1 --network host --name neuron neuron python3 -m vllm.entrypoints.api_server \
-    --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --max-num-seqs 8 --max-model-len 128 --block-size 128 --device neuron --tensor-parallel-size 2 &
-
-# Wait for the server to start
-wait_for_server_to_start() {
-    timeout=300
-    counter=0
-
-    while [ "$(curl -s -o /dev/null -w '%{http_code}' localhost:8000/health)" != "200" ]; do
-        sleep 1
-        counter=$((counter + 1))
-        if [ $counter -ge $timeout ]; then
-            echo "Timeout after $timeout seconds"
-            break
-        fi
-    done
-}
-wait_for_server_to_start
-
-# Test a simple prompt
-curl -X POST -H "Content-Type: application/json" \
-    localhost:8000/generate \
-    -d '{"prompt": "San Francisco is a"}'
+docker run --rm -it --device=/dev/neuron0 --network bridge \
+    -v "${HF_CACHE}:${HF_MOUNT}" \
+    -e "HF_HOME=${HF_MOUNT}" \
+    -v "${NEURON_COMPILE_CACHE_URL}:${NEURON_COMPILE_CACHE_MOUNT}" \
+    -e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \
+    --name "${container_name}" \
+    ${image_name} \
+    /bin/bash -c "python3 /workspace/vllm/examples/offline_inference/neuron.py && python3 -m pytest /workspace/vllm/tests/neuron/1_core/ -v --capture=tee-sys && python3 -m pytest /workspace/vllm/tests/neuron/2_core/ -v --capture=tee-sys"
@@ -1,16 +0,0 @@
-#!/bin/bash
-
-# This script build the OpenVINO docker image and run the offline inference inside the container.
-# It serves a sanity check for compilation and basic model usage.
-set -ex
-
-# Try building the docker image
-docker build -t openvino-test -f Dockerfile.openvino .
-
-# Setup cleanup
-remove_docker_container() { docker rm -f openvino-test || true; }
-trap remove_docker_container EXIT
-remove_docker_container
-
-# Run the image and launch offline inference
-docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference.py
@@ -1,17 +0,0 @@
-#!/bin/bash
-
-set -e
-
-# Build the docker image.
-docker build -f Dockerfile.tpu -t vllm-tpu .
-
-# Set up cleanup.
-remove_docker_container() { docker rm -f tpu-test || true; }
-trap remove_docker_container EXIT
-# Remove the container that might not be cleaned up in the previous run.
-remove_docker_container
-
-# For HF_TOKEN.
-source /etc/environment
-# Run a simple end-to-end example.
-docker run --privileged --net host --shm-size=16G -it -e "HF_TOKEN=$HF_TOKEN" --name tpu-test vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git && python3 -m pip install pytest && python3 -m pip install lm_eval[api]==0.4.4 && pytest -v -s /workspace/vllm/tests/entrypoints/openai/test_accuracy.py && pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py && python3 /workspace/vllm/tests/tpu/test_compilation.py && python3 /workspace/vllm/examples/offline_inference_tpu.py"
.buildkite/run-tpu-v1-test.sh (new executable file, 43 lines)
@@ -0,0 +1,43 @@
+#!/bin/bash
+
+set -e
+
+# Build the docker image.
+docker build -f docker/Dockerfile.tpu -t vllm-tpu .
+
+# Set up cleanup.
+remove_docker_container() { docker rm -f tpu-test || true; }
+trap remove_docker_container EXIT
+# Remove the container that might not be cleaned up in the previous run.
+remove_docker_container
+
+# For HF_TOKEN.
+source /etc/environment
+# Run a simple end-to-end example.
+docker run --privileged --net host --shm-size=16G -it \
+    -e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
+    vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \
+    && python3 -m pip install pytest \
+    && python3 -m pip install lm_eval[api]==0.4.4 \
+    && export VLLM_USE_V1=1 \
+    && export VLLM_XLA_CHECK_RECOMPILATION=1 \
+    && echo TEST_0 \
+    && pytest -v -s /workspace/vllm/tests/v1/tpu/test_perf.py \
+    && echo TEST_1 \
+    && pytest -v -s /workspace/vllm/tests/tpu/test_compilation.py \
+    && echo TEST_2 \
+    && pytest -v -s /workspace/vllm/tests/v1/tpu/test_basic.py \
+    && echo TEST_3 \
+    && pytest -v -s /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine \
+    && echo TEST_4 \
+    && pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py \
+    && echo TEST_5 \
+    && python3 /workspace/vllm/examples/offline_inference/tpu.py \
+    && echo TEST_6 \
+    && pytest -s -v /workspace/vllm/tests/v1/tpu/worker/test_tpu_model_runner.py \
+    && echo TEST_7 \
+    && pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py" \
+
+
+# TODO: This test fails because it uses RANDOM_SEED sampling
+# && VLLM_USE_V1=1 pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
@@ -4,16 +4,28 @@
 # It serves a sanity check for compilation and basic model usage.
 set -ex
 
+image_name="xpu/vllm-ci:${BUILDKITE_COMMIT}"
+container_name="xpu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
+
 # Try building the docker image
-docker build -t xpu-test -f Dockerfile.xpu .
+docker build -t ${image_name} -f docker/Dockerfile.xpu .
 
 # Setup cleanup
-remove_docker_container() { docker rm -f xpu-test || true; }
+remove_docker_container() {
+  docker rm -f "${container_name}" || true;
+  docker image rm -f "${image_name}" || true;
+  docker system prune -f || true;
+}
 trap remove_docker_container EXIT
-remove_docker_container
 
 # Run the image and test offline inference/tensor parallel
-docker run --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path --entrypoint="" xpu-test sh -c '
-  python3 examples/offline_inference.py
-  python3 examples/offline_inference_cli.py -tp 2
+docker run \
+  --device /dev/dri \
+  -v /dev/dri/by-path:/dev/dri/by-path \
+  --entrypoint="" \
+  --name "${container_name}" \
+  "${image_name}" \
+  sh -c '
+  VLLM_USE_V1=0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m
+  VLLM_USE_V1=0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m -tp 2
 '
@ -2,7 +2,7 @@
|
|||||||
# adding a new command to an existing step. See different options here for examples.
|
# adding a new command to an existing step. See different options here for examples.
|
||||||
|
|
||||||
# This script will be feed into Jinja template in `test-template-aws.j2` at
|
# This script will be feed into Jinja template in `test-template-aws.j2` at
|
||||||
# https://github.com/vllm-project/buildkite-ci/blob/main/scripts/test-template-aws.j2
|
# https://github.com/vllm-project/buildkite-ci/blob/main/scripts/test-template-aws.j2
|
||||||
# to generate the final pipeline yaml file.
|
# to generate the final pipeline yaml file.
|
||||||
|
|
||||||
# Documentation
|
# Documentation
|
||||||
@ -15,7 +15,7 @@
|
|||||||
# mirror_hardwares(list): the list of hardwares to run the test on as well. currently only supports [amd]
|
# mirror_hardwares(list): the list of hardwares to run the test on as well. currently only supports [amd]
|
||||||
# gpu(str): override the GPU selection for the test. default is on L4 GPUs. currently only supports a100
|
# gpu(str): override the GPU selection for the test. default is on L4 GPUs. currently only supports a100
|
||||||
# num_gpus(int): override the number of GPUs for the test. default to 1 GPU. currently support 2,4.
|
# num_gpus(int): override the number of GPUs for the test. default to 1 GPU. currently support 2,4.
|
||||||
# num_nodes(int): whether to simulate multi-node setup by launch multiple containers on one host,
|
# num_nodes(int): whether to simulate multi-node setup by launch multiple containers on one host,
|
||||||
# in this case, commands must be specified. the first command runs on first host, the second
|
# in this case, commands must be specified. the first command runs on first host, the second
|
||||||
# command runs on the second host.
|
# command runs on the second host.
|
||||||
# working_dir(str): specify the place where command should execute, default to /vllm-workspace/tests
|
# working_dir(str): specify the place where command should execute, default to /vllm-workspace/tests
|
||||||
@ -24,8 +24,8 @@
|
|||||||
# When adding a test
|
# When adding a test
|
||||||
# - If the test belong to an existing group, add it there
|
# - If the test belong to an existing group, add it there
|
||||||
# - If the test is short, add to any existing step
|
# - If the test is short, add to any existing step
|
||||||
# - If the test takes more than 10min, then it is okay to create a new step.
|
# - If the test takes more than 10min, then it is okay to create a new step.
|
||||||
# Note that all steps execute in parallel.
|
# Note that all steps execute in parallel.
|
||||||
|
|
||||||
steps:
|
steps:
|
||||||
##### fast check tests #####
|
##### fast check tests #####
|
||||||
@ -35,13 +35,12 @@ steps:
|
|||||||
fast_check: true
|
fast_check: true
|
||||||
no_gpu: True
|
no_gpu: True
|
||||||
commands:
|
commands:
|
||||||
- pip install -r requirements-docs.txt
|
- pip install -r ../../requirements/docs.txt
|
||||||
- SPHINXOPTS=\"-W\" make html
|
- SPHINXOPTS=\"-W\" make html
|
||||||
# Check API reference (if it fails, you may have missing mock imports)
|
# Check API reference (if it fails, you may have missing mock imports)
|
||||||
- grep \"sig sig-object py\" build/html/dev/sampling_params.html
|
- grep \"sig sig-object py\" build/html/api/inference_params.html
|
||||||
|
|
||||||
- label: Async Engine, Inputs, Utils, Worker Test # 24min
|
- label: Async Engine, Inputs, Utils, Worker Test # 24min
|
||||||
fast_check: true
|
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/mq_llm_engine
|
- tests/mq_llm_engine
|
||||||
@ -50,9 +49,9 @@ steps:
|
|||||||
- tests/multimodal
|
- tests/multimodal
|
||||||
- tests/test_utils
|
- tests/test_utils
|
||||||
- tests/worker
|
- tests/worker
|
||||||
- tests/standalone_tests/lazy_torch_compile.py
|
- tests/standalone_tests/lazy_imports.py
|
||||||
commands:
|
commands:
|
||||||
- python3 standalone_tests/lazy_torch_compile.py
|
- python3 standalone_tests/lazy_imports.py
|
||||||
- pytest -v -s mq_llm_engine # MQLLMEngine
|
- pytest -v -s mq_llm_engine # MQLLMEngine
|
||||||
- pytest -v -s async_engine # AsyncLLMEngine
|
- pytest -v -s async_engine # AsyncLLMEngine
|
||||||
- NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py
|
- NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py
|
||||||
@ -76,7 +75,10 @@ steps:
|
|||||||
- tests/basic_correctness/test_basic_correctness
|
- tests/basic_correctness/test_basic_correctness
|
||||||
- tests/basic_correctness/test_cpu_offload
|
- tests/basic_correctness/test_cpu_offload
|
||||||
- tests/basic_correctness/test_preemption
|
- tests/basic_correctness/test_preemption
|
||||||
|
- tests/basic_correctness/test_cumem.py
|
||||||
commands:
|
commands:
|
||||||
|
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||||
|
- pytest -v -s basic_correctness/test_cumem.py
|
||||||
- pytest -v -s basic_correctness/test_basic_correctness.py
|
- pytest -v -s basic_correctness/test_basic_correctness.py
|
||||||
- pytest -v -s basic_correctness/test_cpu_offload.py
|
- pytest -v -s basic_correctness/test_cpu_offload.py
|
||||||
- VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
|
- VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
|
||||||
@@ -105,43 +107,61 @@ steps:
  mirror_hardwares: [amd]
  source_file_dependencies:
  - vllm/
+ - tests/entrypoints/llm
+ - tests/entrypoints/openai
+ - tests/entrypoints/test_chat_utils
+ - tests/entrypoints/offline_mode
  commands:
- - pip install -e ./plugins/vllm_add_dummy_model
+ - export VLLM_WORKER_MULTIPROC_METHOD=spawn
- - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py
+ - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
  - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
  - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
  - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
- - pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
+ - VLLM_USE_V1=0 pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
- - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py
+ - pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/test_chat_with_tool_reasoning.py --ignore=entrypoints/openai/correctness/
- - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
  - pytest -v -s entrypoints/test_chat_utils.py
- - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
+ - VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests

  - label: Distributed Tests (4 GPUs) # 10min
  working_dir: "/vllm-workspace/tests"
  num_gpus: 4
- fast_check: true
  source_file_dependencies:
  - vllm/distributed/
  - vllm/core/
- - tests/distributed
+ - tests/distributed/test_utils
+ - tests/distributed/test_pynccl
  - tests/spec_decode/e2e/test_integration_dist_tp4
- - tests/compile
+ - tests/compile/test_basic_correctness
+ - examples/offline_inference/rlhf.py
+ - examples/offline_inference/rlhf_colocate.py
+ - tests/examples/offline_inference/data_parallel.py
+ - tests/v1/test_async_llm_dp.py
  commands:
+ # test with tp=2 and external_dp=2
+ - VLLM_USE_V1=0 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
+ - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
+ # test with internal dp
+ - python3 ../examples/offline_inference/data_parallel.py
+ - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
  - pytest -v -s distributed/test_utils.py
  - pytest -v -s compile/test_basic_correctness.py
  - pytest -v -s distributed/test_pynccl.py
  - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py
+ # TODO: create a dedicated test section for multi-GPU example tests
+ # when we have multiple distributed example tests
+ - pushd ../examples/offline_inference
+ - python3 rlhf.py
+ - RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
+ - popd

  - label: Metrics, Tracing Test # 10min
  num_gpus: 2
- fast_check: true
  source_file_dependencies:
  - vllm/
  - tests/metrics
  - tests/tracing
  commands:
  - pytest -v -s metrics
  - "pip install \
  'opentelemetry-sdk>=1.26.0,<1.27.0' \
  'opentelemetry-api>=1.26.0,<1.27.0' \
@@ -168,6 +188,9 @@ steps:
  - vllm/
  - tests/engine
  - tests/tokenization
+ - tests/test_sequence
+ - tests/test_config
+ - tests/test_logger
  commands:
  - pytest -v -s engine test_sequence.py test_config.py test_logger.py
  # OOM in the CI unless we run this separately
@@ -179,7 +202,23 @@ steps:
  - vllm/
  - tests/v1
  commands:
- - VLLM_USE_V1=1 pytest -v -s v1
+ # split the test to avoid interference
+ - pytest -v -s v1/core
+ - pytest -v -s v1/entrypoints
+ - pytest -v -s v1/engine
+ - pytest -v -s v1/entrypoints
+ - pytest -v -s v1/sample
+ - pytest -v -s v1/worker
+ - pytest -v -s v1/structured_output
+ - pytest -v -s v1/test_stats.py
+ - pytest -v -s v1/test_utils.py
+ - pytest -v -s v1/test_oracle.py
+ # TODO: accuracy does not match, whether setting
+ # VLLM_USE_FLASHINFER_SAMPLER or not on H100.
+ - pytest -v -s v1/e2e
+ # Integration test for streaming correctness (requires special branch).
+ - pip install -U git+https://github.com/robertgshaw2-neuralmagic/lm-evaluation-harness.git@streaming-api
+ - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine

  - label: Examples Test # 25min
  working_dir: "/vllm-workspace/examples"
@@ -189,19 +228,22 @@ steps:
  - examples/
  commands:
  - pip install tensorizer # for tensorizer test
- - python3 offline_inference.py
+ - python3 offline_inference/basic/generate.py --model facebook/opt-125m
- - python3 cpu_offload.py
+ - python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
- - python3 offline_inference_chat.py
+ - python3 offline_inference/basic/chat.py
- - python3 offline_inference_with_prefix.py
+ - python3 offline_inference/prefix_caching.py
- - python3 llm_engine_example.py
+ - python3 offline_inference/llm_engine_example.py
- - python3 offline_inference_vision_language.py
+ - python3 offline_inference/audio_language.py --seed 0
- - python3 offline_inference_vision_language_multi_image.py
+ - python3 offline_inference/vision_language.py --seed 0
- - python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
+ - python3 offline_inference/vision_language_embedding.py --seed 0
- - python3 offline_inference_encoder_decoder.py
+ - python3 offline_inference/vision_language_multi_image.py --seed 0
- - python3 offline_inference_classification.py
+ - VLLM_USE_V1=0 python3 other/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 other/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- - python3 offline_inference_embedding.py
+ - python3 offline_inference/encoder_decoder.py
- - python3 offline_inference_scoring.py
+ - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
- - python3 offline_profile.py --model facebook/opt-125m run_num_steps --num-steps 2
+ - python3 offline_inference/basic/classify.py
+ - python3 offline_inference/basic/embed.py
+ - python3 offline_inference/basic/score.py
+ - VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2

  - label: Prefix Caching Test # 9min
  mirror_hardwares: [amd]
@@ -216,6 +258,7 @@ steps:
  - vllm/model_executor/layers
  - vllm/sampling_metadata.py
  - tests/samplers
+ - tests/conftest.py
  commands:
  - pytest -v -s samplers
  - VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
@@ -227,28 +270,29 @@ steps:
  - vllm/model_executor/guided_decoding
  - tests/test_logits_processor
  - tests/model_executor/test_guided_processors
  commands:
  - pytest -v -s test_logits_processor.py
  - pytest -v -s model_executor/test_guided_processors.py

- - label: Speculative decoding tests # 30min
+ - label: Speculative decoding tests # 40min
  source_file_dependencies:
  - vllm/spec_decode
  - tests/spec_decode
+ - vllm/model_executor/models/eagle.py
  commands:
  - pytest -v -s spec_decode/e2e/test_multistep_correctness.py
- - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py
+ - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py --ignore=spec_decode/e2e/test_mtp_correctness.py
+ - pytest -v -s spec_decode/e2e/test_eagle_correctness.py

  - label: LoRA Test %N # 15min each
  mirror_hardwares: [amd]
  source_file_dependencies:
  - vllm/lora
  - tests/lora
- command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py
+ command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_minicpmv_tp.py --ignore=lora/test_transfomers_model.py
  parallelism: 4

- label: "PyTorch Fullgraph Smoke Test" # 9min
|
- label: PyTorch Fullgraph Smoke Test # 9min
|
||||||
fast_check: true
|
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/compile
|
- tests/compile
|
||||||
@ -257,8 +301,9 @@ steps:
|
|||||||
# these tests need to be separated, cannot combine
|
# these tests need to be separated, cannot combine
|
||||||
- pytest -v -s compile/piecewise/test_simple.py
|
- pytest -v -s compile/piecewise/test_simple.py
|
||||||
- pytest -v -s compile/piecewise/test_toy_llama.py
|
- pytest -v -s compile/piecewise/test_toy_llama.py
|
||||||
|
- pytest -v -s compile/test_pass_manager.py
|
||||||
|
|
||||||
- label: "PyTorch Fullgraph Test" # 18min
|
- label: PyTorch Fullgraph Test # 18min
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/compile
|
- tests/compile
|
||||||
@@ -310,6 +355,14 @@ steps:
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
  - bash ./run-tests.sh -c configs/models-small.txt -t 1

+ - label: OpenAI API correctness
+ source_file_dependencies:
+ - csrc/
+ - vllm/entrypoints/openai/
+ - vllm/model_executor/models/whisper.py
+ commands: # LMEval+Transcription WER check
+ - pytest -s entrypoints/openai/correctness/

  - label: Encoder Decoder tests # 5min
  source_file_dependencies:
  - vllm/
@@ -333,10 +386,10 @@ steps:
  - vllm/
  - tests/models
  commands:
- - pip install -e ./plugins/vllm_add_dummy_model
+ - pytest -v -s models/test_transformers.py
- - pytest -v -s models/test_oot_registration.py # it needs a clean process
  - pytest -v -s models/test_registry.py
- - pytest -v -s models/test_initialization.py
+ # V1 Test: https://github.com/vllm-project/vllm/issues/14531
+ - VLLM_USE_V1=0 pytest -v -s models/test_initialization.py

  - label: Language Models Test (Standard) # 32min
  #mirror_hardwares: [amd]
@@ -360,23 +413,27 @@ steps:
  - pytest -v -s models/decoder_only/language -m 'not core_model and not quant_model'
  - pytest -v -s models/embedding/language -m 'not core_model'

- - label: Multi-Modal Models Test (Standard) # 28min
+ - label: Multi-Modal Models Test (Standard) # 40min
  #mirror_hardwares: [amd]
  source_file_dependencies:
  - vllm/
  - tests/models/decoder_only/audio_language
  - tests/models/decoder_only/vision_language
  - tests/models/embedding/vision_language
+ - tests/models/encoder_decoder/audio_language
  - tests/models/encoder_decoder/vision_language
  commands:
  - pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
+ - pytest -v -s models/multimodal
  - pytest -v -s models/decoder_only/audio_language -m 'core_model or quant_model'
  - pytest -v -s --ignore models/decoder_only/vision_language/test_phi3v.py models/decoder_only/vision_language -m 'core_model or quant_model'
  - pytest -v -s models/embedding/vision_language -m core_model
+ - pytest -v -s models/encoder_decoder/audio_language -m core_model
  - pytest -v -s models/encoder_decoder/language -m core_model
  - pytest -v -s models/encoder_decoder/vision_language -m core_model
+ - pytest -v -s models/decoder_only/vision_language/test_interleaved.py

- - label: Multi-Modal Models Test (Extended) 1 # 1h16m
+ - label: Multi-Modal Models Test (Extended) 1 # 48m
  optional: true
  source_file_dependencies:
  - vllm/
@@ -459,20 +516,45 @@ steps:
  - vllm/worker/worker_base.py
  - vllm/worker/worker.py
  - vllm/worker/model_runner.py
+ - entrypoints/llm/test_collective_rpc.py
+ - tests/v1/test_async_llm_dp.py
+ - vllm/v1/engine/
  commands:
+ - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
+ - pytest -v -s entrypoints/llm/test_collective_rpc.py
  - pytest -v -s ./compile/test_basic_correctness.py
  - pytest -v -s ./compile/test_wrapper.py
  - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
  - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
  # Avoid importing model tests that cause CUDA reinitialization error
+ - pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)'
  - pytest models/encoder_decoder/language/test_bart.py -v -s -m 'distributed(num_gpus=2)'
  - pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m 'distributed(num_gpus=2)'
  - pytest models/decoder_only/vision_language/test_models.py -v -s -m 'distributed(num_gpus=2)'
- - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
+ # this test fails consistently.
+ # TODO: investigate and fix
+ # - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
+ - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
+ - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/test_disagg.py

+ - label: Plugin Tests (2 GPUs) # 40min
+ working_dir: "/vllm-workspace/tests"
+ num_gpus: 2
+ source_file_dependencies:
+ - vllm/plugins/
+ - tests/plugins/
+ commands:
+ # begin platform plugin tests, all the code in-between runs on dummy platform
+ - pip install -e ./plugins/vllm_add_dummy_platform
+ - pytest -v -s plugins_tests/test_platform_plugins.py
+ - pip uninstall vllm_add_dummy_platform -y
+ # end platform plugin tests
+ # other tests continue here:
+ - pytest -v -s plugins_tests/test_scheduler_plugins.py
  - pip install -e ./plugins/vllm_add_dummy_model
  - pytest -v -s distributed/test_distributed_oot.py
- - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
+ - pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
- - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/disagg_test.py
+ - pytest -v -s models/test_oot_registration.py # it needs a clean process

  - label: Multi-step Tests (4 GPUs) # 36min
  working_dir: "/vllm-workspace/tests"
@@ -489,7 +571,9 @@ steps:
  - vllm/engine
  - tests/multi_step
  commands:
- - pytest -v -s multi_step/test_correctness_async_llm.py
+ # this test is quite flaky
+ # TODO: investigate and fix.
+ # - pytest -v -s multi_step/test_correctness_async_llm.py
  - pytest -v -s multi_step/test_correctness_llm.py

  - label: Pipeline Parallelism Test # 45min
@@ -514,12 +598,12 @@ steps:
  # FIXIT: find out which code initialize cuda before running the test
  # before the fix, we need to use spawn to test it
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
- # This test runs llama 13B, so it is required to run on 4 GPUs.
+ # There is some Tensor Parallelism related processing logic in LoRA that
- - pytest -v -s -x lora/test_long_context.py
- # There is some Tensor Parallelism related processing logic in LoRA that
  # requires multi-GPU testing for validation.
  - pytest -v -s -x lora/test_chatglm3_tp.py
  - pytest -v -s -x lora/test_llama_tp.py
+ - pytest -v -s -x lora/test_minicpmv_tp.py
+ - pytest -v -s -x lora/test_transfomers_model.py


  - label: Weight Loading Multiple GPU Test # 33min
@@ -540,7 +624,7 @@ steps:
  - vllm/
  - tests/weight_loading
  commands:
  - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt


  ##### multi gpus test #####
@@ -552,7 +636,7 @@ steps:
  num_gpus: 4
  source_file_dependencies:
  - vllm/
  commands:
  # NOTE: don't test llama model here, it seems hf implementation is buggy
  # see https://github.com/vllm-project/vllm/pull/5689 for details
  - pytest -v -s distributed/test_custom_all_reduce.py
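For orientation when reading the step entries above: each item under `steps` in this Buildkite pipeline pairs a human-readable label with the source paths that should trigger it and the shell commands to run. A minimal sketch of that shape follows (the field names are the ones appearing in the diff above; the label, paths, and command values here are illustrative placeholders, not a real step from the file):

- label: Example Test # illustrative step name, not taken from the pipeline
  source_file_dependencies: # the step runs only when these paths change
  - vllm/
  - tests/example
  commands: # shell commands the CI agent executes for this step
  - pytest -v -s example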
@@ -50,8 +50,11 @@ aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
  if [[ $normal_wheel == *"cu118"* ]]; then
  # if $normal_wheel matches cu118, do not upload the index.html
  echo "Skipping index files for cu118 wheels"
+ elif [[ $normal_wheel == *"cu121"* ]]; then
+ # if $normal_wheel matches cu121, do not upload the index.html
+ echo "Skipping index files for cu121 wheels"
  else
- # only upload index.html for cu12 wheels (default wheels)
+ # only upload index.html for cu124 wheels (default wheels)
  aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
  aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
  fi
@@ -63,8 +66,11 @@ aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/"
  if [[ $normal_wheel == *"cu118"* ]]; then
  # if $normal_wheel matches cu118, do not upload the index.html
  echo "Skipping index files for cu118 wheels"
+ elif [[ $normal_wheel == *"cu121"* ]]; then
+ # if $normal_wheel matches cu121, do not upload the index.html
+ echo "Skipping index files for cu121 wheels"
  else
- # only upload index.html for cu12 wheels (default wheels)
+ # only upload index.html for cu124 wheels (default wheels)
  aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html"
  fi

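Both hunks above apply the same guard in the wheel-upload script: index files are skipped for cu118 and cu121 builds and published only for the default cu124 wheels. A condensed sketch of that branch, assuming `$normal_wheel` holds the wheel filename as it does earlier in the script (only the commit-scoped upload is shown for brevity):

# sketch only: mirrors the if/elif/else structure added in the diff above
if [[ $normal_wheel == *"cu118"* ]]; then
    echo "Skipping index files for cu118 wheels"
elif [[ $normal_wheel == *"cu121"* ]]; then
    echo "Skipping index files for cu121 wheels"
else
    # default (cu124) wheels are the ones that get index.html published
    aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
fi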
46  .github/CODEOWNERS  vendored
@@ -2,32 +2,40 @@
  # for more info about CODEOWNERS file

  # This lists cover the "core" components of vLLM that require careful review
- /vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
+ /vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
- /vllm/core @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
+ /vllm/core @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
- /vllm/engine/llm_engine.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
+ /vllm/engine/llm_engine.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
- /vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
+ /vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
- /vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
+ /vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
- /vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
+ /vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
- /vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-neuralmagic @comaniac @njhill
+ /vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
+ /vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth
+ /vllm/model_executor/guided_decoding @mgoin @russellb
+ /vllm/multimodal @DarkLight1337 @ywang96
  CMakeLists.txt @tlrmchlsmth

  # vLLM V1
- /vllm/v1 @WoosukKwon @robertgshaw2-neuralmagic @njhill @ywang96 @comaniac @alexm-neuralmagic
+ /vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
+ /vllm/v1/structured_output @mgoin @russellb

  # Test ownership
- /tests/async_engine @njhill @robertgshaw2-neuralmagic @simon-mo
- /tests/test_inputs.py @DarkLight1337 @ywang96
- /tests/entrypoints @DarkLight1337 @robertgshaw2-neuralmagic @simon-mo
- /tests/models @DarkLight1337 @ywang96
- /tests/multimodal @DarkLight1337 @ywang96
- /tests/prefix_caching @comaniac @KuntaiDu
- /tests/spec_decode @njhill @LiuXiaoxuanPKU
- /tests/kernels @tlrmchlsmth @WoosukKwon
- /tests/quantization @mgoin @robertgshaw2-neuralmagic
  /.buildkite/lm-eval-harness @mgoin @simon-mo
+ /tests/async_engine @njhill @robertgshaw2-redhat @simon-mo
+ /tests/basic_correctness/test_chunked_prefill @rkooo567 @comaniac
  /tests/distributed/test_multi_node_assignment.py @youkaichao
  /tests/distributed/test_pipeline_parallel.py @youkaichao
  /tests/distributed/test_same_node.py @youkaichao
- /tests/multi_step @alexm-neuralmagic @comaniac
+ /tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo
+ /tests/entrypoints/llm/test_guided_generate.py @mgoin @russellb
+ /tests/kernels @tlrmchlsmth @WoosukKwon
+ /tests/model_executor/test_guided_processors.py @mgoin @russellb
+ /tests/models @DarkLight1337 @ywang96
+ /tests/multi_step @alexm-redhat @comaniac
+ /tests/multimodal @DarkLight1337 @ywang96
+ /tests/prefix_caching @comaniac @KuntaiDu
+ /tests/quantization @mgoin @robertgshaw2-redhat
+ /tests/spec_decode @njhill @LiuXiaoxuanPKU
+ /tests/test_inputs.py @DarkLight1337 @ywang96
+ /tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb
+ /tests/v1/structured_output @mgoin @russellb
  /tests/weight_loading @mgoin @youkaichao
- /tests/basic_correctness/test_chunked_prefill @rkooo567 @comaniac
@@ -30,15 +30,6 @@ body:
  </details>
  validations:
  required: true
- - type: textarea
- attributes:
- label: Model Input Dumps
- description: |
- If you are facing crashing due to illegal memory access or other issues with model execution, vLLM may dump the problematic input of the model. In this case, you will see the message `Error in model execution (input dumped to /tmp/err_xxx.pkl)`. If you see this message, please zip the file (because GitHub doesn't support .pkl file format) and upload it here. This will help us to reproduce the issue and facilitate the debugging process.
- placeholder: |
- Upload the dumped input file.
- validations:
- required: false
  - type: textarea
  attributes:
  label: 🐛 Describe the bug
@@ -9,7 +9,7 @@ body:
  value: >
  #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).

- #### We also highly recommend you read https://docs.vllm.ai/en/latest/models/adding_model.html first to understand how to add a new model.
+ #### We also highly recommend you read https://docs.vllm.ai/en/latest/contributing/model/adding_model.html first to understand how to add a new model.
  - type: textarea
  attributes:
  label: The model to consider.
28  .github/ISSUE_TEMPLATE/800-misc discussion.yml  vendored
@@ -1,28 +0,0 @@
- name: 🎲 Misc/random discussions that do not fit into the above categories.
- description: Submit a discussion as you like. Note that developers are heavily overloaded and we mainly rely on community users to answer these issues.
- title: "[Misc]: "
- labels: ["misc"]
-
- body:
- - type: markdown
- attributes:
- value: >
- #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
- - type: textarea
- attributes:
- label: Anything you want to discuss about vllm.
- description: >
- Anything you want to discuss about vllm.
- validations:
- required: true
- - type: markdown
- attributes:
- value: >
- Thanks for contributing 🎉!
- - type: checkboxes
- id: askllm
- attributes:
- label: Before submitting a new issue...
- options:
- - label: Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions.
- required: true
4  .github/ISSUE_TEMPLATE/config.yml  vendored
@@ -1 +1,5 @@
  blank_issues_enabled: false
+ contact_links:
+ - name: Questions
+ url: https://discuss.vllm.ai
+ about: Ask questions and discuss with other vLLM community members

3  .github/PULL_REQUEST_TEMPLATE.md  vendored
@@ -2,4 +2,5 @@ FILL IN THE PR DESCRIPTION HERE

  FIX #xxxx (*link existing issues this PR will resolve*)

- **BEFORE SUBMITTING, PLEASE READ https://docs.vllm.ai/en/latest/contributing/overview.html **
+ <!--- pyml disable-next-line no-emphasis-as-heading -->
+ **BEFORE SUBMITTING, PLEASE READ <https://docs.vllm.ai/en/latest/contributing/overview.html>**

2  .github/dependabot.yml  vendored
@@ -23,7 +23,7 @@ updates:
- dependency-name: "lm-format-enforcer"
|
- dependency-name: "lm-format-enforcer"
|
||||||
- dependency-name: "gguf"
|
- dependency-name: "gguf"
|
||||||
- dependency-name: "compressed-tensors"
|
- dependency-name: "compressed-tensors"
|
||||||
- dependency-name: "ray[adag]"
|
- dependency-name: "ray[cgraph]" # Ray Compiled Graph
|
||||||
- dependency-name: "lm-eval"
|
- dependency-name: "lm-eval"
|
||||||
groups:
|
groups:
|
||||||
minor-update:
|
minor-update:
|
||||||
|
|||||||
85  .github/mergify.yml  vendored
@@ -5,6 +5,7 @@ pull_request_rules:
  - or:
  - files~=^[^/]+\.md$
  - files~=^docs/
+ - files~=^examples/
  actions:
  label:
  add:
@@ -18,7 +19,7 @@ pull_request_rules:
  - files~=\.buildkite/
  - files~=^cmake/
  - files=CMakeLists.txt
- - files~=^Dockerfile
+ - files~=^docker/Dockerfile
  - files~=^requirements.*\.txt
  - files=setup.py
  actions:
@@ -35,6 +36,88 @@ pull_request_rules:
  add:
  - frontend

+ - name: label-multi-modality
+ description: Automatically apply multi-modality label
+ conditions:
+ - or:
+ - files~=^vllm/multimodal/
+ - files~=^tests/multimodal/
+ - files~=^tests/models/multimodal/
+ - files~=^tests/models/*/audio_language/
+ - files~=^tests/models/*/vision_language/
+ - files=tests/models/test_vision.py
+ actions:
+ label:
+ add:
+ - multi-modality

+ - name: label-structured-output
+ description: Automatically apply structured-output label
+ conditions:
+ - or:
+ - files~=^vllm/model_executor/guided_decoding/
+ - files=tests/model_executor/test_guided_processors.py
+ - files=tests/entrypoints/llm/test_guided_generate.py
+ - files=benchmarks/benchmark_serving_guided.py
+ - files=benchmarks/benchmark_guided.py
+ actions:
+ label:
+ add:
+ - structured-output

+ - name: label-speculative-decoding
+ description: Automatically apply speculative-decoding label
+ conditions:
+ - or:
+ - files~=^vllm/spec_decode/
+ - files=vllm/model_executor/layers/spec_decode_base_sampler.py
+ - files~=^tests/spec_decode/
+ actions:
+ label:
+ add:
+ - speculative-decoding

+ - name: label-v1
+ description: Automatically apply v1 label
+ conditions:
+ - or:
+ - files~=^vllm/v1/
+ - files~=^tests/v1/
+ actions:
+ label:
+ add:
+ - v1

+ - name: label-tpu
+ description: Automatically apply tpu label
+ # Keep this list in sync with `label-tpu-remove` conditions
+ conditions:
+ - or:
+ - files~=tpu.py
+ - files~=_tpu
+ - files~=tpu_
+ - files~=/tpu/
+ - files~=pallas
+ actions:
+ label:
+ add:
+ - tpu

+ - name: label-tpu-remove
+ description: Automatically remove tpu label
+ # Keep this list in sync with `label-tpu` conditions
+ conditions:
+ - and:
+ - -files~=tpu.py
+ - -files~=_tpu
+ - -files~=tpu_
+ - -files~=/tpu/
+ - -files~=pallas
+ actions:
+ label:
+ remove:
+ - tpu

  - name: ping author on conflicts and add 'needs-rebase' label
  conditions:
  - conflict
40  .github/workflows/actionlint.yml  vendored
@@ -1,40 +0,0 @@
- name: Lint GitHub Actions workflows
- on:
- push:
- branches:
- - "main"
- paths:
- - '.github/workflows/*.ya?ml'
- - '.github/workflows/actionlint.*'
- - '.github/workflows/matchers/actionlint.json'
- pull_request:
- branches:
- - "main"
- paths:
- - '.github/workflows/*.ya?ml'
- - '.github/workflows/actionlint.*'
- - '.github/workflows/matchers/actionlint.json'
-
- env:
- LC_ALL: en_US.UTF-8
-
- defaults:
- run:
- shell: bash
-
- permissions:
- contents: read
-
- jobs:
- actionlint:
- runs-on: ubuntu-latest
- steps:
- - name: "Checkout"
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- with:
- fetch-depth: 0
-
- - name: "Run actionlint"
- run: |
- echo "::add-matcher::.github/workflows/matchers/actionlint.json"
- tools/actionlint.sh -color
53  .github/workflows/clang-format.yml  vendored
@@ -1,53 +0,0 @@
- name: clang-format
-
- on:
- # Trigger the workflow on push or pull request,
- # but only for the main branch
- push:
- branches:
- - main
- paths:
- - '**/*.h'
- - '**/*.cpp'
- - '**/*.cu'
- - '**/*.cuh'
- - '.github/workflows/clang-format.yml'
- pull_request:
- branches:
- - main
- paths:
- - '**/*.h'
- - '**/*.cpp'
- - '**/*.cu'
- - '**/*.cuh'
- - '.github/workflows/clang-format.yml'
-
- jobs:
- clang-format:
- runs-on: ubuntu-latest
- strategy:
- matrix:
- python-version: ["3.11"]
- steps:
- - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- - name: Set up Python ${{ matrix.python-version }}
- uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
- with:
- python-version: ${{ matrix.python-version }}
- - name: Install dependencies
- run: |
- python -m pip install --upgrade pip
- pip install clang-format==18.1.5
- - name: Running clang-format
- run: |
- EXCLUDES=(
- 'csrc/moe/topk_softmax_kernels.cu'
- 'csrc/quantization/gguf/ggml-common.h'
- 'csrc/quantization/gguf/dequantize.cuh'
- 'csrc/quantization/gguf/vecdotq.cuh'
- 'csrc/quantization/gguf/mmq.cuh'
- 'csrc/quantization/gguf/mmvq.cuh'
- )
- find csrc/ \( -name '*.h' -o -name '*.cpp' -o -name '*.cu' -o -name '*.cuh' \) -print \
- | grep -vFf <(printf "%s\n" "${EXCLUDES[@]}") \
- | xargs clang-format --dry-run --Werror
2  .github/workflows/cleanup_pr_body.yml  vendored
@@ -16,7 +16,7 @@ jobs:
  uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

  - name: Set up Python
- uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
+ uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
  with:
  python-version: '3.12'

45  .github/workflows/codespell.yml  vendored
@@ -1,45 +0,0 @@
- name: codespell
-
- on:
- # Trigger the workflow on push or pull request,
- # but only for the main branch
- push:
- branches:
- - main
- paths:
- - "**/*.py"
- - "**/*.md"
- - "**/*.rst"
- - pyproject.toml
- - requirements-lint.txt
- - .github/workflows/codespell.yml
- pull_request:
- branches:
- - main
- paths:
- - "**/*.py"
- - "**/*.md"
- - "**/*.rst"
- - pyproject.toml
- - requirements-lint.txt
- - .github/workflows/codespell.yml
-
- jobs:
- codespell:
- runs-on: ubuntu-latest
- strategy:
- matrix:
- python-version: ["3.12"]
- steps:
- - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- - name: Set up Python ${{ matrix.python-version }}
- uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
- with:
- python-version: ${{ matrix.python-version }}
- - name: Install dependencies
- run: |
- python -m pip install --upgrade pip
- pip install -r requirements-lint.txt
- - name: Spelling check with codespell
- run: |
- codespell --toml pyproject.toml
15  .github/workflows/lint-and-deploy.yaml  vendored
@@ -12,22 +12,22 @@ jobs:
  fetch-depth: 0

  - name: Set up Helm
- uses: azure/setup-helm@fe7b79cd5ee1e45176fcad797de68ecaf3ca4814 # v4.2.0
+ uses: azure/setup-helm@b9e51907a09c216f16ebe8536097933489208112 # v4.3.0
  with:
  version: v3.14.4

  #Python is required because ct lint runs Yamale and yamllint which require Python.
- - uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
+ - uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
  with:
  python-version: '3.13'

  - name: Set up chart-testing
- uses: helm/chart-testing-action@e6669bcd63d7cb57cb4380c33043eebe5d111992 # v2.6.1
+ uses: helm/chart-testing-action@0d28d3144d3a25ea2cc349d6e59901c4ff469b3b # v2.7.0
  with:
  version: v3.10.1

  - name: Run chart-testing (lint)
- run: ct lint --target-branch ${{ github.event.repository.default_branch }} --chart-dirs examples/chart-helm --charts examples/chart-helm
+ run: ct lint --target-branch ${{ github.event.repository.default_branch }} --chart-dirs examples/online_serving/chart-helm --charts examples/online_serving/chart-helm

  - name: Setup minio
  run: |
@@ -47,10 +47,10 @@ jobs:
  aws --endpoint-url http://127.0.0.1:9000/ s3 cp opt-125m/ s3://testbucket/opt-125m --recursive

  - name: Create kind cluster
- uses: helm/kind-action@0025e74a8c7512023d06dc019c617aa3cf561fde # v1.10.0
+ uses: helm/kind-action@a1b0e391336a6ee6713a0583f8c6240d70863de3 # v1.12.0

  - name: Build the Docker image vllm cpu
- run: docker buildx build -f Dockerfile.cpu -t vllm-cpu-env .
+ run: docker buildx build -f docker/Dockerfile.cpu -t vllm-cpu-env .

  - name: Configuration of docker images, network and namespace for the kind cluster
  run: |
@@ -64,7 +64,8 @@ jobs:
  run: |
  export AWS_ACCESS_KEY_ID=minioadmin
  export AWS_SECRET_ACCESS_KEY=minioadmin
- helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/chart-helm -f examples/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env"
+ sleep 30 && kubectl -n ns-vllm logs -f "$(kubectl -n ns-vllm get pods | awk '/deployment/ {print $1;exit}')" &
+ helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/online_serving/chart-helm -f examples/online_serving/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env"

  - name: curl test
  run: |
17  .github/workflows/matchers/ruff.json  vendored
@@ -1,17 +0,0 @@
- {
- "problemMatcher": [
- {
- "owner": "ruff",
- "pattern": [
- {
- "regexp": "^(.+?):(\\d+):(\\d+): (\\w+): (.+)$",
- "file": 1,
- "line": 2,
- "column": 3,
- "code": 4,
- "message": 5
- }
- ]
- }
- ]
- }
51  .github/workflows/mypy.yaml  vendored
@@ -1,51 +0,0 @@
- name: mypy
-
- on:
- # Trigger the workflow on push or pull request,
- # but only for the main branch
- push:
- branches:
- - main
- paths:
- - '**/*.py'
- - '.github/workflows/mypy.yaml'
- - 'tools/mypy.sh'
- - 'pyproject.toml'
- pull_request:
- branches:
- - main
- # This workflow is only relevant when one of the following files changes.
- # However, we have github configured to expect and require this workflow
- # to run and pass before github with auto-merge a pull request. Until github
- # allows more flexible auto-merge policy, we can just run this on every PR.
- # It doesn't take that long to run, anyway.
- #paths:
- # - '**/*.py'
- # - '.github/workflows/mypy.yaml'
- # - 'tools/mypy.sh'
- # - 'pyproject.toml'
-
- jobs:
- mypy:
- runs-on: ubuntu-latest
- strategy:
- matrix:
- python-version: ["3.9", "3.10", "3.11", "3.12"]
- steps:
- - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- - name: Set up Python ${{ matrix.python-version }}
- uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
- with:
- python-version: ${{ matrix.python-version }}
- - name: Install dependencies
- run: |
- python -m pip install --upgrade pip
- pip install mypy==1.11.1
- pip install types-setuptools
- pip install types-PyYAML
- pip install types-requests
- pip install types-setuptools
- - name: Mypy
- run: |
- echo "::add-matcher::.github/workflows/matchers/mypy.json"
- tools/mypy.sh 1 ${{ matrix.python-version }}
37  .github/workflows/png-lint.yml  vendored
@@ -1,37 +0,0 @@
- name: Lint PNG exports from excalidraw
- on:
- push:
- branches:
- - "main"
- paths:
- - '*.excalidraw.png'
- - '.github/workflows/png-lint.yml'
- pull_request:
- branches:
- - "main"
- paths:
- - '*.excalidraw.png'
- - '.github/workflows/png-lint.yml'
-
- env:
- LC_ALL: en_US.UTF-8
-
- defaults:
- run:
- shell: bash
-
- permissions:
- contents: read
-
- jobs:
- actionlint:
- runs-on: ubuntu-latest
- steps:
- - name: "Checkout"
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- with:
- fetch-depth: 0
-
- - name: "Run png-lint.sh to check excalidraw exported images"
- run: |
- tools/png-lint.sh
||||||
20
.github/workflows/pre-commit.yml
vendored
Normal file
20
.github/workflows/pre-commit.yml
vendored
Normal file
@ -0,0 +1,20 @@
|
|||||||
|
name: pre-commit
|
||||||
|
|
||||||
|
on:
|
||||||
|
pull_request:
|
||||||
|
push:
|
||||||
|
branches: [main]
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
pre-commit:
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
||||||
|
- uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
|
||||||
|
with:
|
||||||
|
python-version: "3.12"
|
||||||
|
- run: echo "::add-matcher::.github/workflows/matchers/actionlint.json"
|
||||||
|
- run: echo "::add-matcher::.github/workflows/matchers/mypy.json"
|
||||||
|
- uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1
|
||||||
|
with:
|
||||||
|
extra_args: --all-files --hook-stage manual
|
||||||
4
.github/workflows/publish.yml
vendored
4
.github/workflows/publish.yml
vendored
@ -39,7 +39,7 @@ jobs:
|
|||||||
const script = require('.github/workflows/scripts/create_release.js')
|
const script = require('.github/workflows/scripts/create_release.js')
|
||||||
await script(github, context, core)
|
await script(github, context, core)
|
||||||
|
|
||||||
# NOTE(simon): No longer build wheel using Github Actions. See buildkite's release workflow.
|
# NOTE(simon): No longer build wheel using GitHub Actions. See buildkite's release workflow.
|
||||||
# wheel:
|
# wheel:
|
||||||
# name: Build Wheel
|
# name: Build Wheel
|
||||||
# runs-on: ${{ matrix.os }}
|
# runs-on: ${{ matrix.os }}
|
||||||
@ -50,7 +50,7 @@ jobs:
|
|||||||
# matrix:
|
# matrix:
|
||||||
# os: ['ubuntu-20.04']
|
# os: ['ubuntu-20.04']
|
||||||
# python-version: ['3.9', '3.10', '3.11', '3.12']
|
# python-version: ['3.9', '3.10', '3.11', '3.12']
|
||||||
# pytorch-version: ['2.4.0'] # Must be the most recent version that meets requirements-cuda.txt.
|
# pytorch-version: ['2.4.0'] # Must be the most recent version that meets requirements/cuda.txt.
|
||||||
# cuda-version: ['11.8', '12.1']
|
# cuda-version: ['11.8', '12.1']
|
||||||
|
|
||||||
# steps:
|
# steps:
|
||||||
|
|||||||
8
.github/workflows/reminder_comment.yml
vendored
8
.github/workflows/reminder_comment.yml
vendored
@ -2,7 +2,6 @@ name: PR Reminder Comment Bot
|
|||||||
on:
|
on:
|
||||||
pull_request_target:
|
pull_request_target:
|
||||||
types: [opened]
|
types: [opened]
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
pr_reminder:
|
pr_reminder:
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
@ -15,7 +14,12 @@ jobs:
|
|||||||
owner: context.repo.owner,
|
owner: context.repo.owner,
|
||||||
repo: context.repo.repo,
|
repo: context.repo.repo,
|
||||||
issue_number: context.issue.number,
|
issue_number: context.issue.number,
|
||||||
body: '👋 Hi! Thank you for contributing to the vLLM project.\n Just a reminder: PRs would not trigger full CI run by default. Instead, it would only run `fastcheck` CI which starts running only a small and essential subset of CI tests to quickly catch errors. You can run other CI tests on top of those by going to your `fastcheck` build on Buildkite UI (linked in the PR checks section) and unblock them. If you do not have permission to unblock, ping `simon-mo` or `khluu` to add you in our Buildkite org. \n\nOnce the PR is approved and ready to go, your PR reviewer(s) can run CI to test the changes comprehensively before merging.\n\n To run CI, PR reviewers can do one of these:\n- Add `ready` label to the PR\n- Enable auto-merge.\n\n🚀'
|
body: '👋 Hi! Thank you for contributing to the vLLM project.\n\n' +
|
||||||
|
'💬 Join our developer Slack at https://slack.vllm.ai to discuss your PR in #pr-reviews, coordinate on features in #feat- channels, or join special interest groups in #sig- channels.\n\n' +
|
||||||
|
'Just a reminder: PRs would not trigger full CI run by default. Instead, it would only run `fastcheck` CI which starts running only a small and essential subset of CI tests to quickly catch errors. You can run other CI tests on top of those by going to your `fastcheck` build on Buildkite UI (linked in the PR checks section) and unblock them. If you do not have permission to unblock, ping `simon-mo` or `khluu` to add you in our Buildkite org.\n\n' +
|
||||||
|
'Once the PR is approved and ready to go, your PR reviewer(s) can run CI to test the changes comprehensively before merging.\n\n' +
|
||||||
|
'To run CI, PR reviewers can either: Add `ready` label to the PR or enable auto-merge.\n\n' +
|
||||||
|
'🚀'
|
||||||
})
|
})
|
||||||
env:
|
env:
|
||||||
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||||
|
|||||||
52
.github/workflows/ruff.yml
vendored
52
.github/workflows/ruff.yml
vendored
@ -1,52 +0,0 @@
|
|||||||
name: ruff
|
|
||||||
|
|
||||||
on:
|
|
||||||
# Trigger the workflow on push or pull request,
|
|
||||||
# but only for the main branch
|
|
||||||
push:
|
|
||||||
branches:
|
|
||||||
- main
|
|
||||||
paths:
|
|
||||||
- "**/*.py"
|
|
||||||
- pyproject.toml
|
|
||||||
- requirements-lint.txt
|
|
||||||
- .github/workflows/matchers/ruff.json
|
|
||||||
- .github/workflows/ruff.yml
|
|
||||||
pull_request:
|
|
||||||
branches:
|
|
||||||
- main
|
|
||||||
# This workflow is only relevant when one of the following files changes.
|
|
||||||
# However, we have github configured to expect and require this workflow
|
|
||||||
# to run and pass before github with auto-merge a pull request. Until github
|
|
||||||
# allows more flexible auto-merge policy, we can just run this on every PR.
|
|
||||||
# It doesn't take that long to run, anyway.
|
|
||||||
#paths:
|
|
||||||
# - "**/*.py"
|
|
||||||
# - pyproject.toml
|
|
||||||
# - requirements-lint.txt
|
|
||||||
# - .github/workflows/matchers/ruff.json
|
|
||||||
# - .github/workflows/ruff.yml
|
|
||||||
|
|
||||||
jobs:
|
|
||||||
ruff:
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
strategy:
|
|
||||||
matrix:
|
|
||||||
python-version: ["3.12"]
|
|
||||||
steps:
|
|
||||||
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
|
||||||
- name: Set up Python ${{ matrix.python-version }}
|
|
||||||
uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
|
|
||||||
with:
|
|
||||||
python-version: ${{ matrix.python-version }}
|
|
||||||
- name: Install dependencies
|
|
||||||
run: |
|
|
||||||
python -m pip install --upgrade pip
|
|
||||||
pip install -r requirements-lint.txt
|
|
||||||
- name: Analysing the code with ruff
|
|
||||||
run: |
|
|
||||||
echo "::add-matcher::.github/workflows/matchers/ruff.json"
|
|
||||||
ruff check --output-format github .
|
|
||||||
- name: Run isort
|
|
||||||
run: |
|
|
||||||
isort . --check-only
|
|
||||||
2  .github/workflows/scripts/build.sh  (vendored)

@@ -9,7 +9,7 @@ PATH=${cuda_home}/bin:$PATH
 LD_LIBRARY_PATH=${cuda_home}/lib64:$LD_LIBRARY_PATH

 # Install requirements
-$python_executable -m pip install -r requirements-build.txt -r requirements-cuda.txt
+$python_executable -m pip install -r requirements/build.txt -r requirements/cuda.txt

 # Limit the number of parallel jobs to avoid OOM
 export MAX_JOBS=1
2  .github/workflows/scripts/create_release.js  (vendored)

@@ -1,4 +1,4 @@
-// Uses Github's API to create the release and wait for result.
+// Uses GitHub's API to create the release and wait for result.
 // We use a JS script since github CLI doesn't provide a way to wait for the release's creation and returns immediately.

 module.exports = async (github, context, core) => {
37  .github/workflows/shellcheck.yml  (vendored, deleted)

@@ -1,37 +0,0 @@
-name: Lint shell scripts
-on:
-  push:
-    branches:
-      - "main"
-    paths:
-      - '**/*.sh'
-      - '.github/workflows/shellcheck.yml'
-  pull_request:
-    branches:
-      - "main"
-    paths:
-      - '**/*.sh'
-      - '.github/workflows/shellcheck.yml'
-
-env:
-  LC_ALL: en_US.UTF-8
-
-defaults:
-  run:
-    shell: bash
-
-permissions:
-  contents: read
-
-jobs:
-  shellcheck:
-    runs-on: ubuntu-latest
-    steps:
-      - name: "Checkout"
-        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-        with:
-          fetch-depth: 0
-
-      - name: "Check shell scripts"
-        run: |
-          tools/shellcheck.sh
32  .github/workflows/sphinx-lint.yml  (vendored, deleted)

@@ -1,32 +0,0 @@
-name: Lint documentation
-
-on:
-  push:
-    branches:
-      - main
-    paths:
-      - "docs/**"
-  pull_request:
-    branches:
-      - main
-    paths:
-      - "docs/**"
-
-jobs:
-  sphinx-lint:
-    runs-on: ubuntu-latest
-    strategy:
-      matrix:
-        python-version: ["3.12"]
-    steps:
-    - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-    - name: Set up Python ${{ matrix.python-version }}
-      uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
-      with:
-        python-version: ${{ matrix.python-version }}
-    - name: Install dependencies
-      run: |
-        python -m pip install --upgrade pip
-        pip install -r requirements-lint.txt
-    - name: Linting docs
-      run: tools/sphinx-lint.sh
2  .github/workflows/stale.yml  (vendored)

@@ -13,7 +13,7 @@ jobs:
       actions: write
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/stale@28ca1036281a5e5922ead5184a1bbf96e5fc984e # v9.0.0
+      - uses: actions/stale@5bef64f19d7facfb25b37b414482c7164d639639 # v9.1.0
        with:
          # Increasing this value ensures that changes to this workflow
          # propagate to all issues and PRs in days rather than months
38  .github/workflows/yapf.yml  (vendored, deleted)

@@ -1,38 +0,0 @@
-name: yapf
-
-on:
-  # Trigger the workflow on push or pull request,
-  # but only for the main branch
-  push:
-    branches:
-      - main
-    paths:
-      - "**/*.py"
-      - .github/workflows/yapf.yml
-  pull_request:
-    branches:
-      - main
-    paths:
-      - "**/*.py"
-      - .github/workflows/yapf.yml
-
-jobs:
-  yapf:
-    runs-on: ubuntu-latest
-    strategy:
-      matrix:
-        python-version: ["3.12"]
-    steps:
-    - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-    - name: Set up Python ${{ matrix.python-version }}
-      uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
-      with:
-        python-version: ${{ matrix.python-version }}
-    - name: Install dependencies
-      run: |
-        python -m pip install --upgrade pip
-        pip install yapf==0.32.0
-        pip install toml==0.10.2
-    - name: Running yapf
-      run: |
-        yapf --diff --recursive .
10  .gitignore  (vendored)

@@ -2,7 +2,8 @@
 /vllm/_version.py

 # vllm-flash-attn built from source
-vllm/vllm_flash_attn/
+vllm/vllm_flash_attn/*
+!vllm/vllm_flash_attn/fa_utils.py

 # Byte-compiled / optimized / DLL files
 __pycache__/
@@ -79,10 +80,7 @@ instance/

 # Sphinx documentation
 docs/_build/
-docs/source/getting_started/examples/*.rst
-!**/*.template.rst
-docs/source/getting_started/examples/*.md
-!**/*.template.md
+docs/source/getting_started/examples/

 # PyBuilder
 .pybuilder/
@@ -200,7 +198,7 @@ _build/
 hip_compat.h

 # Benchmark dataset
-benchmarks/*.json
+benchmarks/**/*.json

 # Linting
 actionlint
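The `benchmarks/*.json` to `benchmarks/**/*.json` change matters because a single `*` only matches files directly under `benchmarks/`, while `**` also matches nested directories. A quick, hypothetical Python check of the analogous glob semantics (the file names in the comments are made up):

```python
# Illustrative only: compare shallow vs. recursive glob patterns with pathlib.
from pathlib import Path

repo = Path(".")
shallow = set(repo.glob("benchmarks/*.json"))       # e.g. benchmarks/results.json
recursive = set(repo.glob("benchmarks/**/*.json"))  # also benchmarks/serving/run1.json
# Files picked up only by the recursive pattern:
print(sorted(p.as_posix() for p in recursive - shallow))
```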
132  .pre-commit-config.yaml  (new file)

@@ -0,0 +1,132 @@
+default_install_hook_types:
+  - pre-commit
+  - commit-msg
+default_stages:
+  - pre-commit # Run locally
+  - manual # Run in CI
+exclude: 'vllm/third_party/.*'
+repos:
+- repo: https://github.com/google/yapf
+  rev: v0.43.0
+  hooks:
+  - id: yapf
+    args: [--in-place, --verbose]
+    additional_dependencies: [toml] # TODO: Remove when yapf is upgraded
+- repo: https://github.com/astral-sh/ruff-pre-commit
+  rev: v0.9.3
+  hooks:
+  - id: ruff
+    args: [--output-format, github, --fix]
+- repo: https://github.com/codespell-project/codespell
+  rev: v2.4.0
+  hooks:
+  - id: codespell
+    additional_dependencies: ['tomli']
+    args: ['--toml', 'pyproject.toml']
+- repo: https://github.com/PyCQA/isort
+  rev: 0a0b7a830386ba6a31c2ec8316849ae4d1b8240d # 6.0.0
+  hooks:
+  - id: isort
+- repo: https://github.com/pre-commit/mirrors-clang-format
+  rev: v19.1.7
+  hooks:
+  - id: clang-format
+    exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))|vllm/third_party/.*'
+    types_or: [c++, cuda]
+    args: [--style=file, --verbose]
+- repo: https://github.com/jackdewinter/pymarkdown
+  rev: v0.9.27
+  hooks:
+  - id: pymarkdown
+    args: [fix]
+- repo: https://github.com/rhysd/actionlint
+  rev: v1.7.7
+  hooks:
+  - id: actionlint
+- repo: https://github.com/astral-sh/uv-pre-commit
+  rev: 0.6.2
+  hooks:
+  - id: pip-compile
+    args: [requirements/test.in, -o, requirements/test.txt]
+    files: ^requirements/test\.(in|txt)$
+- repo: local
+  hooks:
+  - id: mypy-local
+    name: Run mypy for local Python installation
+    entry: tools/mypy.sh 0 "local"
+    language: python
+    types: [python]
+    additional_dependencies: &mypy_deps [mypy==1.11.1, types-cachetools, types-setuptools, types-PyYAML, types-requests]
+    stages: [pre-commit] # Don't run in CI
+  - id: mypy-3.9 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
+    name: Run mypy for Python 3.9
+    entry: tools/mypy.sh 1 "3.9"
+    language: python
+    types: [python]
+    additional_dependencies: *mypy_deps
+    stages: [manual] # Only run in CI
+  - id: mypy-3.10 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
+    name: Run mypy for Python 3.10
+    entry: tools/mypy.sh 1 "3.10"
+    language: python
+    types: [python]
+    additional_dependencies: *mypy_deps
+    stages: [manual] # Only run in CI
+  - id: mypy-3.11 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
+    name: Run mypy for Python 3.11
+    entry: tools/mypy.sh 1 "3.11"
+    language: python
+    types: [python]
+    additional_dependencies: *mypy_deps
+    stages: [manual] # Only run in CI
+  - id: mypy-3.12 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
+    name: Run mypy for Python 3.12
+    entry: tools/mypy.sh 1 "3.12"
+    language: python
+    types: [python]
+    additional_dependencies: *mypy_deps
+    stages: [manual] # Only run in CI
+  - id: shellcheck
+    name: Lint shell scripts
+    entry: tools/shellcheck.sh
+    language: script
+    types: [shell]
+  - id: png-lint
+    name: Lint PNG exports from excalidraw
+    entry: tools/png-lint.sh
+    language: script
+    types: [png]
+  - id: signoff-commit
+    name: Sign-off Commit
+    entry: bash
+    args:
+      - -c
+      - |
+        if ! grep -q "^Signed-off-by: $(git config user.name) <$(git config user.email)>" .git/COMMIT_EDITMSG; then
+          printf "\nSigned-off-by: $(git config user.name) <$(git config user.email)>\n" >> .git/COMMIT_EDITMSG
+        fi
+    language: system
+    verbose: true
+    stages: [commit-msg]
+  - id: check-spdx-header
+    name: Check SPDX headers
+    entry: python tools/check_spdx_header.py
+    language: python
+    types: [python]
+  - id: check-filenames
+    name: Check for spaces in all filenames
+    entry: bash
+    args:
+      - -c
+      - 'git ls-files | grep " " && echo "Filenames should not contain spaces!" && exit 1 || exit 0'
+    language: system
+    always_run: true
+    pass_filenames: false
+  # Keep `suggestion` last
+  - id: suggestion
+    name: Suggestion
+    entry: bash -c 'echo "To bypass pre-commit hooks, add --no-verify to git commit."'
+    language: system
+    verbose: true
+    pass_filenames: false
+  # Insert new entries above the `suggestion` entry
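The `signoff-commit` hook above is a bash one-liner that appends a DCO trailer to the commit message when it is missing. Purely as an illustration of that logic (this Python helper is not part of the repository), the same behaviour could be written as:

```python
# Illustrative sketch of the signoff-commit hook's logic, not vLLM code:
# append a Signed-off-by trailer to the commit message file if it is missing.
import subprocess
from pathlib import Path

def ensure_signoff(msg_file: str = ".git/COMMIT_EDITMSG") -> None:
    name = subprocess.check_output(["git", "config", "user.name"], text=True).strip()
    email = subprocess.check_output(["git", "config", "user.email"], text=True).strip()
    trailer = f"Signed-off-by: {name} <{email}>"
    path = Path(msg_file)
    text = path.read_text()
    if trailer not in text:
        path.write_text(text + f"\n{trailer}\n")
```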
.readthedocs.yaml

@@ -18,4 +18,4 @@ formats: []
 # Optionally declare the Python requirements required to build your docs
 python:
   install:
-    - requirements: docs/requirements-docs.txt
+    - requirements: requirements/docs.txt
304  CMakeLists.txt

@@ -24,9 +24,6 @@ include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake)
 # Suppress potential warnings about unused manually-specified variables
 set(ignoreMe "${VLLM_PYTHON_PATH}")

-# Prevent installation of dependencies (cutlass) by default.
-install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
-
 #
 # Supported python versions. These versions will be searched in order, the
 # first match will be selected. These should be kept in sync with setup.py.
@@ -34,10 +31,10 @@ install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
 set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12")

 # Supported NVIDIA architectures.
-set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0")
+set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0")

 # Supported AMD GPU architectures.
-set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101")
+set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201")

 #
 # Supported/expected torch versions for CUDA/ROCm.
@@ -47,10 +44,10 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx11
 #
 # Note: the CUDA torch version is derived from pyproject.toml and various
 # requirements.txt files and should be kept consistent. The ROCm torch
-# versions are derived from Dockerfile.rocm
+# versions are derived from docker/Dockerfile.rocm
 #
-set(TORCH_SUPPORTED_VERSION_CUDA "2.5.1")
-set(TORCH_SUPPORTED_VERSION_ROCM "2.5.1")
+set(TORCH_SUPPORTED_VERSION_CUDA "2.6.0")
+set(TORCH_SUPPORTED_VERSION_ROCM "2.6.0")

 #
 # Try to find python package with an executable that exactly matches
@@ -177,10 +174,54 @@ include(FetchContent)
 file(MAKE_DIRECTORY ${FETCHCONTENT_BASE_DIR}) # Ensure the directory exists
 message(STATUS "FetchContent base directory: ${FETCHCONTENT_BASE_DIR}")

+#
+# Set rocm version dev int.
+#
+if(VLLM_GPU_LANG STREQUAL "HIP")
+  #
+  # Overriding the default -O set up by cmake, adding ggdb3 for the most verbose devug info
+  #
+  set(CMAKE_${VLLM_GPU_LANG}_FLAGS_DEBUG "${CMAKE_${VLLM_GPU_LANG}_FLAGS_DEBUG} -O0 -ggdb3")
+  set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0 -ggdb3")
+
+  #
+  # Certain HIP functions are marked as [[nodiscard]], yet vllm ignores the result which generates
+  # a lot of warnings that always mask real issues. Suppressing until this is properly addressed.
+  #
+  set(CMAKE_${VLLM_GPU_LANG}_FLAGS "${CMAKE_${VLLM_GPU_LANG}_FLAGS} -Wno-unused-result")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-result")
+endif()
+
 #
 # Define other extension targets
 #

+#
+# cumem_allocator extension
+#
+
+set(VLLM_CUMEM_EXT_SRC
+  "csrc/cumem_allocator.cpp")
+
+set_gencode_flags_for_srcs(
+  SRCS "${VLLM_CUMEM_EXT_SRC}"
+  CUDA_ARCHS "${CUDA_ARCHS}")
+
+if(VLLM_GPU_LANG STREQUAL "CUDA")
+  message(STATUS "Enabling cumem allocator extension.")
+  # link against cuda driver library
+  list(APPEND CUMEM_LIBS CUDA::cuda_driver)
+  define_gpu_extension_target(
+    cumem_allocator
+    DESTINATION vllm
+    LANGUAGE CXX
+    SOURCES ${VLLM_CUMEM_EXT_SRC}
+    LIBRARIES ${CUMEM_LIBS}
+    USE_SABI 3.8
+    WITH_SOABI)
+endif()
+
 #
 # _C extension
 #
@@ -193,6 +234,7 @@ set(VLLM_EXT_SRC
   "csrc/activation_kernels.cu"
   "csrc/layernorm_kernels.cu"
   "csrc/layernorm_quant_kernels.cu"
+  "csrc/cuda_view.cu"
   "csrc/quantization/gptq/q_gemm.cu"
   "csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
   "csrc/quantization/fp8/common.cu"
@@ -200,13 +242,15 @@ set(VLLM_EXT_SRC
   "csrc/quantization/gguf/gguf_kernel.cu"
   "csrc/cuda_utils_kernels.cu"
   "csrc/prepare_inputs/advance_step.cu"
+  "csrc/custom_all_reduce.cu"
   "csrc/torch_bindings.cpp")

 if(VLLM_GPU_LANG STREQUAL "CUDA")
   SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")

   # Set CUTLASS_REVISION manually -- its revision detection doesn't work in this case.
-  set(CUTLASS_REVISION "v3.6.0" CACHE STRING "CUTLASS revision to use")
+  # Please keep this in sync with FetchContent_Declare line below.
+  set(CUTLASS_REVISION "v3.8.0" CACHE STRING "CUTLASS revision to use")

   # Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
   if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR})
@@ -223,13 +267,14 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     FetchContent_Declare(
         cutlass
         GIT_REPOSITORY https://github.com/nvidia/cutlass.git
-        GIT_TAG 8aa95dbb888be6d81c6fbf7169718c5244b53227
+        # Please keep this in sync with CUTLASS_REVISION line above.
+        GIT_TAG v3.8.0
        GIT_PROGRESS TRUE

        # Speed up CUTLASS download by retrieving only the specified GIT_TAG instead of the history.
        # Important: If GIT_SHALLOW is enabled then GIT_TAG works only with branch names and tags.
        # So if the GIT_TAG above is updated to a commit hash, GIT_SHALLOW must be set to FALSE
-       GIT_SHALLOW FALSE
+       GIT_SHALLOW TRUE
     )
   endif()
   FetchContent_MakeAvailable(cutlass)
@@ -239,11 +284,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     "csrc/mamba/causal_conv1d/causal_conv1d.cu"
     "csrc/quantization/aqlm/gemm_kernels.cu"
     "csrc/quantization/awq/gemm_kernels.cu"
-    "csrc/custom_all_reduce.cu"
     "csrc/permute_cols.cu"
     "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
+    "csrc/quantization/fp4/nvfp4_quant_entry.cu"
+    "csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu"
     "csrc/sparse/cutlass/sparse_scaled_mm_entry.cu"
-    "csrc/sparse/cutlass/sparse_compressor_entry.cu"
     "csrc/cutlass_extensions/common.cpp")

   set_gencode_flags_for_srcs(
@@ -253,7 +298,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   # Only build Marlin kernels if we are building for at least some compatible archs.
   # Keep building Marlin for 9.0 as there are some group sizes and shapes that
   # are not supported by Machete yet.
-  cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.7;8.9;9.0" ${CUDA_ARCHS})
+  cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0" "${CUDA_ARCHS}")
   if (MARLIN_ARCHS)
     set(MARLIN_SRCS
       "csrc/quantization/fp8/fp8_marlin.cu"
@@ -273,38 +318,87 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
                    " in CUDA target architectures")
   endif()

+  # Only build AllSpark kernels if we are building for at least some compatible archs.
+  cuda_archs_loose_intersection(ALLSPARK_ARCHS "8.0;8.6;8.7;8.9" "${CUDA_ARCHS}")
+  if (ALLSPARK_ARCHS)
+    set(ALLSPARK_SRCS
+      "csrc/quantization/gptq_allspark/allspark_repack.cu"
+      "csrc/quantization/gptq_allspark/allspark_qgemm_w8a16.cu")
+    set_gencode_flags_for_srcs(
+      SRCS "${ALLSPARK_SRCS}"
+      CUDA_ARCHS "${ALLSPARK_ARCHS}")
+    list(APPEND VLLM_EXT_SRC "${ALLSPARK_SRCS}")
+    message(STATUS "Building AllSpark kernels for archs: ${ALLSPARK_ARCHS}")
+  else()
+    message(STATUS "Not building AllSpark kernels as no compatible archs found"
+                   " in CUDA target architectures")
+  endif()
+
+  set(SCALED_MM_3X_ARCHS)
   # The cutlass_scaled_mm kernels for Hopper (c3x, i.e. CUTLASS 3.x) require
-  # CUDA 12.0 or later (and only work on Hopper, 9.0/9.0a for now).
-  cuda_archs_loose_intersection(SCALED_MM_3X_ARCHS "9.0;9.0a" "${CUDA_ARCHS}")
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS)
-    set(SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu")
+  # CUDA 12.0 or later
+  cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}")
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_ARCHS)
+    set(SRCS
+      "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm90.cu"
+      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu"
+      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_int8.cu"
+      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_azp_sm90_int8.cu"
+      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8.cu")
     set_gencode_flags_for_srcs(
       SRCS "${SRCS}"
-      CUDA_ARCHS "${SCALED_MM_3X_ARCHS}")
+      CUDA_ARCHS "${SCALED_MM_ARCHS}")
     list(APPEND VLLM_EXT_SRC "${SRCS}")
-    list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_C3X=1")
-    message(STATUS "Building scaled_mm_c3x for archs: ${SCALED_MM_3X_ARCHS}")
+    list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_SM90=1")
+    # Let scaled_mm_c2x know it doesn't need to build these arches
+    list(APPEND SCALED_MM_3X_ARCHS "${SCALED_MM_ARCHS}")
+    message(STATUS "Building scaled_mm_c3x_sm90 for archs: ${SCALED_MM_ARCHS}")
   else()
-    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS)
-      message(STATUS "Not building scaled_mm_c3x as CUDA Compiler version is "
+    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_ARCHS)
+      message(STATUS "Not building scaled_mm_c3x_sm90 as CUDA Compiler version is "
                      "not >= 12.0, we recommend upgrading to CUDA 12.0 or "
                      "later if you intend on running FP8 quantized models on "
                      "Hopper.")
     else()
-      message(STATUS "Not building scaled_mm_c3x as no compatible archs found "
+      message(STATUS "Not building scaled_mm_c3x_sm90 as no compatible archs found "
                      "in CUDA target architectures")
     endif()
+  endif()
+
-  # clear SCALED_MM_3X_ARCHS so the scaled_mm_c2x kernels know we didn't
-  # build any 3x kernels
-  set(SCALED_MM_3X_ARCHS)
+  # The cutlass_scaled_mm kernels for Blackwell (c3x, i.e. CUTLASS 3.x) require
+  # CUDA 12.8 or later
+  cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;12.0a" "${CUDA_ARCHS}")
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND SCALED_MM_ARCHS)
+    set(SRCS
+      "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu"
+      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu"
+    )
+    set_gencode_flags_for_srcs(
+      SRCS "${SRCS}"
+      CUDA_ARCHS "${SCALED_MM_ARCHS}")
+    list(APPEND VLLM_EXT_SRC "${SRCS}")
+    list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_SM100=1")
+    # Let scaled_mm_c2x know it doesn't need to build these arches
+    list(APPEND SCALED_MM_3X_ARCHS "${SCALED_MM_ARCHS}")
+    message(STATUS "Building scaled_mm_c3x_sm100 for archs: ${SCALED_MM_ARCHS}")
+  else()
+    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND SCALED_MM_ARCHS)
+      message(STATUS "Not building scaled_mm_c3x_sm100 as CUDA Compiler version is "
+                     "not >= 12.8, we recommend upgrading to CUDA 12.8 or "
+                     "later if you intend on running FP8 quantized models on "
+                     "Blackwell.")
+    else()
+      message(STATUS "Not building scaled_mm_c3x_100 as no compatible archs found "
+                     "in CUDA target architectures")
+    endif()
   endif()

   #
   # For the cutlass_scaled_mm kernels we want to build the c2x (CUTLASS 2.x)
   # kernels for the remaining archs that are not already built for 3x.
   cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS
-    "7.5;8.0;8.6;8.7;8.9;9.0" "${CUDA_ARCHS}")
+    "7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0" "${CUDA_ARCHS}")
   # subtract out the archs that are already built for 3x
   list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
   if (SCALED_MM_2X_ARCHS)
@@ -329,18 +423,18 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   # 2:4 Sparse Kernels

   # The 2:4 sparse kernels cutlass_scaled_sparse_mm and cutlass_compressor
-  # require CUDA 12.2 or later (and only work on Hopper, 9.0/9.0a for now).
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_3X_ARCHS)
-    set(SRCS "csrc/sparse/cutlass/sparse_compressor_c3x.cu"
-             "csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu")
+  # require CUDA 12.2 or later (and only work on Hopper).
+  cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}")
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_ARCHS)
+    set(SRCS "csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu")
     set_gencode_flags_for_srcs(
       SRCS "${SRCS}"
-      CUDA_ARCHS "${SCALED_MM_3X_ARCHS}")
+      CUDA_ARCHS "${SCALED_MM_ARCHS}")
     list(APPEND VLLM_EXT_SRC "${SRCS}")
     list(APPEND VLLM_GPU_FLAGS "-DENABLE_SPARSE_SCALED_MM_C3X=1")
-    message(STATUS "Building sparse_scaled_mm_c3x for archs: ${SCALED_MM_3X_ARCHS}")
+    message(STATUS "Building sparse_scaled_mm_c3x for archs: ${SCALED_MM_ARCHS}")
   else()
-    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_3X_ARCHS)
+    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_ARCHS)
       message(STATUS "Not building sparse_scaled_mm_c3x kernels as CUDA Compiler version is "
                      "not >= 12.2, we recommend upgrading to CUDA 12.2 or later "
                      "if you intend on running FP8 sparse quantized models on Hopper.")
@@ -350,6 +444,50 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     endif()
   endif()

+  # FP4 Archs and flags
+  cuda_archs_loose_intersection(FP4_ARCHS "10.0a" "${CUDA_ARCHS}")
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND FP4_ARCHS)
+    set(SRCS
+      "csrc/quantization/fp4/nvfp4_quant_kernels.cu"
+      "csrc/quantization/fp4/nvfp4_scaled_mm_kernels.cu")
+    set_gencode_flags_for_srcs(
+      SRCS "${SRCS}"
+      CUDA_ARCHS "${FP4_ARCHS}")
+    list(APPEND VLLM_EXT_SRC "${SRCS}")
+    list(APPEND VLLM_GPU_FLAGS "-DENABLE_NVFP4=1")
+    message(STATUS "Building NVFP4 for archs: ${FP4_ARCHS}")
+  else()
+    message(STATUS "Not building NVFP4 as no compatible archs were found.")
+    # clear FP4_ARCHS
+    set(FP4_ARCHS)
+  endif()
+
+  #
+  # CUTLASS MoE kernels
+
+  # The MoE kernel cutlass_moe_mm requires CUDA 12.3 or later (and only works
+  # on Hopper). get_cutlass_moe_mm_data should only be compiled if it's possible
+  # to compile MoE kernels that use its output.
+  cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}")
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
+    set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cu"
+             "csrc/quantization/cutlass_w8a8/moe/moe_data.cu")
+    set_gencode_flags_for_srcs(
+      SRCS "${SRCS}"
+      CUDA_ARCHS "${SCALED_MM_ARCHS}")
+    list(APPEND VLLM_EXT_SRC "${SRCS}")
+    list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MOE_SM90=1")
+    message(STATUS "Building grouped_mm_c3x for archs: ${SCALED_MM_ARCHS}")
+  else()
+    if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
+      message(STATUS "Not building grouped_mm_c3x kernels as CUDA Compiler version is "
+                     "not >= 12.3, we recommend upgrading to CUDA 12.3 or later "
+                     "if you intend on running FP8 quantized MoE models on Hopper.")
+    else()
+      message(STATUS "Not building grouped_mm_c3x as no compatible archs found "
+                     "in CUDA target architectures")
+    endif()
+  endif()
+
   #
   # Machete kernels
@@ -431,7 +569,8 @@ define_gpu_extension_target(
   SOURCES ${VLLM_EXT_SRC}
   COMPILE_FLAGS ${VLLM_GPU_FLAGS}
   ARCHITECTURES ${VLLM_GPU_ARCHES}
-  INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR};${CUTLASS_TOOLS_UTIL_INCLUDE_DIR}
+  INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR}
+  INCLUDE_DIRECTORIES ${CUTLASS_TOOLS_UTIL_INCLUDE_DIR}
   USE_SABI 3
   WITH_SOABI)

@@ -450,12 +589,24 @@ set(VLLM_MOE_EXT_SRC
   "csrc/moe/moe_align_sum_kernels.cu"
   "csrc/moe/topk_softmax_kernels.cu")

+if(VLLM_GPU_LANG STREQUAL "CUDA")
+  list(APPEND VLLM_MOE_EXT_SRC "csrc/moe/moe_wna16.cu")
+endif()
+
 set_gencode_flags_for_srcs(
   SRCS "${VLLM_MOE_EXT_SRC}"
   CUDA_ARCHS "${CUDA_ARCHS}")

 if(VLLM_GPU_LANG STREQUAL "CUDA")
-  cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.6;8.7;8.9;9.0" "${CUDA_ARCHS}")
+  set(VLLM_MOE_WNA16_SRC
+    "csrc/moe/moe_wna16.cu")
+
+  set_gencode_flags_for_srcs(
+    SRCS "${VLLM_MOE_WNA16_SRC}"
+    CUDA_ARCHS "${CUDA_ARCHS}")
+
+  list(APPEND VLLM_MOE_EXT_SRC "${VLLM_MOE_WNA16_SRC}")
+  cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0" "${CUDA_ARCHS}")
   if (MARLIN_MOE_ARCHS)
     set(MARLIN_MOE_SRC
       "csrc/moe/marlin_kernels/marlin_moe_kernel.h"
@@ -509,79 +660,8 @@ if(VLLM_GPU_LANG STREQUAL "HIP")
     WITH_SOABI)
 endif()

-# vllm-flash-attn currently only supported on CUDA
-if (NOT VLLM_TARGET_DEVICE STREQUAL "cuda")
-  return()
+# For CUDA we also build and ship some external projects.
+if (VLLM_GPU_LANG STREQUAL "CUDA")
+  include(cmake/external_projects/flashmla.cmake)
+  include(cmake/external_projects/vllm_flash_attn.cmake)
 endif ()
-
-# vLLM flash attention requires VLLM_GPU_ARCHES to contain the set of target
-# arches in the CMake syntax (75-real, 89-virtual, etc), since we clear the
-# arches in the CUDA case (and instead set the gencodes on a per file basis)
-# we need to manually set VLLM_GPU_ARCHES here.
-if(VLLM_GPU_LANG STREQUAL "CUDA")
-  foreach(_ARCH ${CUDA_ARCHS})
-    string(REPLACE "." "" _ARCH "${_ARCH}")
-    list(APPEND VLLM_GPU_ARCHES "${_ARCH}-real")
-  endforeach()
-endif()
-
-#
-# Build vLLM flash attention from source
-#
-# IMPORTANT: This has to be the last thing we do, because vllm-flash-attn uses the same macros/functions as vLLM.
-# Because functions all belong to the global scope, vllm-flash-attn's functions overwrite vLLMs.
-# They should be identical but if they aren't, this is a massive footgun.
-#
-# The vllm-flash-attn install rules are nested under vllm to make sure the library gets installed in the correct place.
-# To only install vllm-flash-attn, use --component vllm_flash_attn_c.
-# If no component is specified, vllm-flash-attn is still installed.
-
-# If VLLM_FLASH_ATTN_SRC_DIR is set, vllm-flash-attn is installed from that directory instead of downloading.
-# This is to enable local development of vllm-flash-attn within vLLM.
-# It can be set as an environment variable or passed as a cmake argument.
-# The environment variable takes precedence.
-if (DEFINED ENV{VLLM_FLASH_ATTN_SRC_DIR})
-  set(VLLM_FLASH_ATTN_SRC_DIR $ENV{VLLM_FLASH_ATTN_SRC_DIR})
-endif()
-
-if(VLLM_FLASH_ATTN_SRC_DIR)
-  FetchContent_Declare(vllm-flash-attn SOURCE_DIR ${VLLM_FLASH_ATTN_SRC_DIR})
-else()
-  FetchContent_Declare(
-      vllm-flash-attn
-      GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
-      GIT_TAG 04325b6798bcc326c86fb35af62d05a9c8c8eceb
-      GIT_PROGRESS TRUE
-      # Don't share the vllm-flash-attn build between build types
-      BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
-  )
-endif()
-
-# Set the parent build flag so that the vllm-flash-attn library does not redo compile flag and arch initialization.
-set(VLLM_PARENT_BUILD ON)
-
-# Ensure the vllm/vllm_flash_attn directory exists before installation
-install(CODE "file(MAKE_DIRECTORY \"\${CMAKE_INSTALL_PREFIX}/vllm/vllm_flash_attn\")" COMPONENT vllm_flash_attn_c)
-
-# Make sure vllm-flash-attn install rules are nested under vllm/
-install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY FALSE)" COMPONENT vllm_flash_attn_c)
-install(CODE "set(OLD_CMAKE_INSTALL_PREFIX \"\${CMAKE_INSTALL_PREFIX}\")" COMPONENT vllm_flash_attn_c)
-install(CODE "set(CMAKE_INSTALL_PREFIX \"\${CMAKE_INSTALL_PREFIX}/vllm/\")" COMPONENT vllm_flash_attn_c)
-
-# Fetch the vllm-flash-attn library
-FetchContent_MakeAvailable(vllm-flash-attn)
-message(STATUS "vllm-flash-attn is available at ${vllm-flash-attn_SOURCE_DIR}")
-
-# Restore the install prefix
-install(CODE "set(CMAKE_INSTALL_PREFIX \"\${OLD_CMAKE_INSTALL_PREFIX}\")" COMPONENT vllm_flash_attn_c)
-install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" COMPONENT vllm_flash_attn_c)
-
-# Copy over the vllm-flash-attn python files
-install(
-  DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/
-  DESTINATION vllm/vllm_flash_attn
-  COMPONENT vllm_flash_attn_c
-  FILES_MATCHING PATTERN "*.py"
-)
-
-# Nothing after vllm-flash-attn, see comment about macros above
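The kernel gating above leans heavily on the `cuda_archs_loose_intersection` helper from `cmake/utils.cmake`. As a rough illustration of the idea only (this Python sketch is not the actual CMake implementation, and the real helper has more nuanced matching rules), intersecting the supported architecture list of a kernel with the build's target list might look like:

```python
# Illustrative sketch only: keep the archs a kernel supports that also appear in
# the build targets, treating arch-specific suffixes such as "9.0a" loosely.
def arch_intersection(supported: list[str], targets: list[str]) -> list[str]:
    def base(arch: str) -> str:
        return arch.rstrip("a")  # "9.0a" and "9.0" share the same base arch
    target_bases = {base(t) for t in targets}
    return [s for s in supported if base(s) in target_bases]

# Example: Hopper-only kernels are skipped when building only for Ampere targets.
print(arch_intersection(["9.0a"], ["8.0", "8.6"]))  # []
print(arch_intersection(["9.0a"], ["9.0"]))         # ['9.0a']
```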
CODE_OF_CONDUCT.md

@@ -61,7 +61,7 @@ representative at an online or offline/IRL event.

 Instances of abusive, harassing, or otherwise unacceptable behavior may be
 reported to the community leaders responsible for enforcement in the #code-of-conduct
-channel in the [vLLM Discord](https://discord.com/invite/jz7wjKhh6g).
+channel in the [vLLM Slack](https://slack.vllm.ai).
 All complaints will be reviewed and investigated promptly and fairly.

 All community leaders are obligated to respect the privacy and security of the
@@ -125,4 +125,3 @@ Community Impact Guidelines were inspired by
 For answers to common questions about this code of conduct, see the
 [Contributor Covenant FAQ](https://www.contributor-covenant.org/faq). Translations are available at
 [Contributor Covenant translations](https://www.contributor-covenant.org/translations).
-
@ -1,69 +0,0 @@
|
|||||||
# This vLLM Dockerfile is used to construct image that can build and run vLLM on x86 CPU platform.
|
|
||||||
|
|
||||||
FROM ubuntu:22.04 AS cpu-test-1
|
|
||||||
|
|
||||||
ENV CCACHE_DIR=/root/.cache/ccache
|
|
||||||
|
|
||||||
ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache
|
|
||||||
|
|
||||||
RUN --mount=type=cache,target=/var/cache/apt \
|
|
||||||
apt-get update -y \
|
|
||||||
&& apt-get install -y curl ccache git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 libnuma-dev \
|
|
||||||
&& apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
|
|
||||||
&& update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
|
|
||||||
|
|
||||||
# https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/performance_tuning/tuning_guide.html
|
|
||||||
# intel-openmp provides additional performance improvement vs. openmp
|
|
||||||
# tcmalloc provides better memory allocation efficiency, e.g, holding memory in caches to speed up access of commonly-used objects.
|
|
||||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
|
||||||
pip install intel-openmp==2025.0.1
|
|
||||||
|
|
||||||
ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/lib/libiomp5.so"
|
|
||||||
|
|
||||||
RUN echo 'ulimit -c 0' >> ~/.bashrc
|
|
||||||
|
|
||||||
RUN pip install intel_extension_for_pytorch==2.5.0
|
|
||||||
|
|
||||||
WORKDIR /workspace
|
|
||||||
|
|
||||||
COPY requirements-build.txt requirements-build.txt
|
|
||||||
ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
|
|
||||||
ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
|
|
||||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
|
||||||
pip install --upgrade pip && \
|
|
||||||
pip install -r requirements-build.txt
|
|
||||||
|
|
||||||
FROM cpu-test-1 AS build
|
|
||||||
|
|
||||||
WORKDIR /workspace/vllm
|
|
||||||
|
|
||||||
COPY requirements-common.txt requirements-common.txt
|
|
||||||
COPY requirements-cpu.txt requirements-cpu.txt
|
|
||||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
|
||||||
pip install -v -r requirements-cpu.txt
|
|
||||||
|
|
||||||
COPY . .
|
|
||||||
ARG GIT_REPO_CHECK=0
|
|
||||||
RUN --mount=type=bind,source=.git,target=.git \
|
|
||||||
if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
|
|
||||||
|
|
||||||
# Support for building with non-AVX512 vLLM: docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" ...
|
|
||||||
ARG VLLM_CPU_DISABLE_AVX512
|
|
||||||
ENV VLLM_CPU_DISABLE_AVX512=${VLLM_CPU_DISABLE_AVX512}
|
|
||||||
|
|
||||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
|
||||||
--mount=type=cache,target=/root/.cache/ccache \
|
|
||||||
--mount=type=bind,source=.git,target=.git \
|
|
||||||
VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel && \
|
|
||||||
pip install dist/*.whl && \
|
|
||||||
rm -rf dist
|
|
||||||
|
|
||||||
WORKDIR /workspace/
|
|
||||||
|
|
||||||
RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
|
|
||||||
|
|
||||||
# install development dependencies (for testing)
|
|
||||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
|
||||||
pip install -e tests/vllm_test_utils
|
|
||||||
|
|
||||||
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
|
|
||||||
@ -1,28 +0,0 @@
|
|||||||
# The vLLM Dockerfile is used to construct vLLM image that can be directly used
|
|
||||||
# to run the OpenAI compatible server.
|
|
||||||
|
|
||||||
FROM ubuntu:22.04 AS dev
|
|
||||||
|
|
||||||
RUN apt-get update -y && \
|
|
||||||
apt-get install -y \
|
|
||||||
git python3-pip \
|
|
||||||
ffmpeg libsm6 libxext6 libgl1
|
|
||||||
WORKDIR /workspace
|
|
||||||
|
|
||||||
COPY . .
|
|
||||||
ARG GIT_REPO_CHECK=0
|
|
||||||
RUN --mount=type=bind,source=.git,target=.git \
|
|
||||||
if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
|
|
||||||
|
|
||||||
# install build requirements
|
|
||||||
RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/requirements-build.txt
|
|
||||||
# build vLLM with OpenVINO backend
|
|
||||||
RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE="openvino" python3 -m pip install /workspace
|
|
||||||
|
|
||||||
COPY examples/ /workspace/examples
|
|
||||||
COPY benchmarks/ /workspace/benchmarks
|
|
||||||
|
|
||||||
# install development dependencies (for testing)
|
|
||||||
RUN python3 -m pip install -e tests/vllm_test_utils
|
|
||||||
|
|
||||||
CMD ["/bin/bash"]
|
|
||||||
@ -1,39 +0,0 @@
|
|||||||
FROM mambaorg/micromamba
|
|
||||||
ARG MAMBA_DOCKERFILE_ACTIVATE=1
|
|
||||||
USER root
|
|
||||||
|
|
||||||
ENV PATH="/usr/local/cargo/bin:$PATH:/opt/conda/bin/"
|
|
||||||
|
|
||||||
RUN apt-get update -y && apt-get install -y git wget curl vim libnuma-dev libsndfile-dev libprotobuf-dev build-essential ffmpeg libsm6 libxext6 libgl1
|
|
||||||
|
|
||||||
# Some packages in requirements-cpu are installed here
|
|
||||||
# IBM provides optimized packages for ppc64le processors in the open-ce project for mamba
|
|
||||||
# Currently these may not be available for venv or pip directly
|
|
||||||
RUN micromamba install -y -n base -c https://ftp.osuosl.org/pub/open-ce/1.11.0-p10/ -c defaults python=3.10 torchvision-cpu=0.16.2 rust && micromamba clean --all --yes
|
|
||||||
|
|
||||||
COPY ./ /workspace/vllm
|
|
||||||
|
|
||||||
WORKDIR /workspace/vllm
|
|
||||||
ARG GIT_REPO_CHECK=0
|
|
||||||
RUN --mount=type=bind,source=.git,target=.git \
|
|
||||||
if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh; fi
|
|
||||||
|
|
||||||
# These packages will be in rocketce eventually
|
|
||||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
|
||||||
pip install -v --prefer-binary --extra-index-url https://repo.fury.io/mgiessing \
|
|
||||||
'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \
|
|
||||||
torch==2.3.1 \
|
|
||||||
-r requirements-cpu.txt \
|
|
||||||
xformers uvloop==0.20.0
|
|
||||||
|
|
||||||
RUN --mount=type=bind,source=.git,target=.git \
|
|
||||||
VLLM_TARGET_DEVICE=cpu python3 setup.py install
|
|
||||||
|
|
||||||
# install development dependencies (for testing)
|
|
||||||
RUN python3 -m pip install -e tests/vllm_test_utils
|
|
||||||
|
|
||||||
WORKDIR /workspace/
|
|
||||||
|
|
||||||
RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
|
|
||||||
|
|
||||||
ENTRYPOINT ["/opt/conda/bin/python3", "-m", "vllm.entrypoints.openai.api_server"]
|
|
||||||
174  Dockerfile.rocm  (deleted)

@@ -1,174 +0,0 @@
-# Default ROCm 6.2 base image
-ARG BASE_IMAGE="rocm/pytorch:rocm6.2_ubuntu20.04_py3.9_pytorch_release_2.3.0"
-
-# Default ROCm ARCHes to build vLLM for.
-ARG PYTORCH_ROCM_ARCH="gfx908;gfx90a;gfx942;gfx1100"
-
-# Whether to install CK-based flash-attention
-# If 0, will not install flash-attention
-ARG BUILD_FA="1"
-ARG FA_GFX_ARCHS="gfx90a;gfx942"
-ARG FA_BRANCH="3cea2fb"
-
-# Whether to build triton on rocm
-ARG BUILD_TRITON="1"
-ARG TRITON_BRANCH="e192dba"
-
-### Base image build stage
-FROM $BASE_IMAGE AS base
-
-# Import arg(s) defined before this build stage
-ARG PYTORCH_ROCM_ARCH
-
-# Install some basic utilities
-RUN apt-get update && apt-get install python3 python3-pip -y
-RUN apt-get update && apt-get install -y \
-    curl \
-    ca-certificates \
-    sudo \
-    git \
-    bzip2 \
-    libx11-6 \
-    build-essential \
-    wget \
-    unzip \
-    tmux \
-    ccache \
-    && rm -rf /var/lib/apt/lists/*
-
-# When launching the container, mount the code directory to /vllm-workspace
-ARG APP_MOUNT=/vllm-workspace
-WORKDIR ${APP_MOUNT}
-
-RUN python3 -m pip install --upgrade pip
-# Remove sccache so it doesn't interfere with ccache
-# TODO: implement sccache support across components
-RUN apt-get purge -y sccache; python3 -m pip uninstall -y sccache; rm -f "$(which sccache)"
-
-# Install torch == 2.6.0 on ROCm
-RUN --mount=type=cache,target=/root/.cache/pip \
-    case "$(ls /opt | grep -Po 'rocm-[0-9]\.[0-9]')" in \
-        *"rocm-6.2"*) \
-            python3 -m pip uninstall -y torch torchvision \
-            && python3 -m pip install --pre \
-                torch==2.6.0.dev20241113+rocm6.2 \
-                'setuptools-scm>=8' \
-                torchvision==0.20.0.dev20241113+rocm6.2 \
-                --extra-index-url https://download.pytorch.org/whl/nightly/rocm6.2;; \
-        *) ;; esac
-
-ENV LLVM_SYMBOLIZER_PATH=/opt/rocm/llvm/bin/llvm-symbolizer
-ENV PATH=$PATH:/opt/rocm/bin:/libtorch/bin:
-ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/lib/:/libtorch/lib:
-ENV CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/libtorch/include:/libtorch/include/torch/csrc/api/include/:/opt/rocm/include/:
-
-ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}
-ENV CCACHE_DIR=/root/.cache/ccache
-
-
-### AMD-SMI build stage
-FROM base AS build_amdsmi
-# Build amdsmi wheel always
-RUN cd /opt/rocm/share/amd_smi \
-    && python3 -m pip wheel . --wheel-dir=/install
-
-
-### Flash-Attention wheel build stage
-FROM base AS build_fa
-ARG BUILD_FA
-ARG FA_GFX_ARCHS
-ARG FA_BRANCH
-# Build ROCm flash-attention wheel if `BUILD_FA = 1`
-RUN --mount=type=cache,target=${CCACHE_DIR} \
-    if [ "$BUILD_FA" = "1" ]; then \
-        mkdir -p libs \
-        && cd libs \
-        && git clone https://github.com/ROCm/flash-attention.git \
-        && cd flash-attention \
-        && git checkout "${FA_BRANCH}" \
-        && git submodule update --init \
-        && GPU_ARCHS="${FA_GFX_ARCHS}" python3 setup.py bdist_wheel --dist-dir=/install; \
-    # Create an empty directory otherwise as later build stages expect one
-    else mkdir -p /install; \
-    fi
-
-
-### Triton wheel build stage
-FROM base AS build_triton
-ARG BUILD_TRITON
-ARG TRITON_BRANCH
-# Build triton wheel if `BUILD_TRITON = 1`
-RUN --mount=type=cache,target=${CCACHE_DIR} \
-    if [ "$BUILD_TRITON" = "1" ]; then \
-        mkdir -p libs \
-        && cd libs \
-        && python3 -m pip install ninja cmake wheel pybind11 \
-        && git clone https://github.com/OpenAI/triton.git \
-        && cd triton \
-        && git checkout "${TRITON_BRANCH}" \
-        && cd python \
-        && python3 setup.py bdist_wheel --dist-dir=/install; \
-    # Create an empty directory otherwise as later build stages expect one
-    else mkdir -p /install; \
-    fi
-
-
-### Final vLLM build stage
-FROM base AS final
-# Import the vLLM development directory from the build context
-COPY . .
-ARG GIT_REPO_CHECK=0
-RUN --mount=type=bind,source=.git,target=.git \
-    if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
-
-RUN python3 -m pip install --upgrade pip
-
-# Package upgrades for useful functionality or to avoid dependency issues
-RUN --mount=type=cache,target=/root/.cache/pip \
-    python3 -m pip install --upgrade numba scipy huggingface-hub[cli] pytest-shard
-
-
-# Workaround for ray >= 2.10.0
-ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
-# Silences the HF Tokenizers warning
-ENV TOKENIZERS_PARALLELISM=false
-
-RUN --mount=type=cache,target=${CCACHE_DIR} \
-    --mount=type=bind,source=.git,target=.git \
-    --mount=type=cache,target=/root/.cache/pip \
-    python3 -m pip install -Ur requirements-rocm.txt \
-    && python3 setup.py clean --all \
-    && python3 setup.py develop
-
-# Copy amdsmi wheel into final image
-RUN --mount=type=bind,from=build_amdsmi,src=/install,target=/install \
-    mkdir -p libs \
-    && cp /install/*.whl libs \
-    # Preemptively uninstall to avoid same-version no-installs
-    && python3 -m pip uninstall -y amdsmi;
-
-# Copy triton wheel(s) into final image if they were built
-RUN --mount=type=bind,from=build_triton,src=/install,target=/install \
-    mkdir -p libs \
-    && if ls /install/*.whl; then \
-        cp /install/*.whl libs \
-        # Preemptively uninstall to avoid same-version no-installs
-        && python3 -m pip uninstall -y triton; fi
-
-# Copy flash-attn wheel(s) into final image if they were built
-RUN --mount=type=bind,from=build_fa,src=/install,target=/install \
-    mkdir -p libs \
-    && if ls /install/*.whl; then \
-        cp /install/*.whl libs \
-        # Preemptively uninstall to avoid same-version no-installs
-        && python3 -m pip uninstall -y flash-attn; fi
-
-# Install wheels that were built to the final image
-RUN --mount=type=cache,target=/root/.cache/pip \
-    if ls libs/*.whl; then \
-    python3 -m pip install libs/*.whl; fi
-
-# install development dependencies (for testing)
-RUN python3 -m pip install -e tests/vllm_test_utils
-
-CMD ["/bin/bash"]
@ -1,69 +0,0 @@
|
|||||||
FROM intel/oneapi-basekit:2024.2.1-0-devel-ubuntu22.04 AS vllm-base
|
|
||||||
|
|
||||||
RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
|
|
||||||
echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
|
|
||||||
chmod 644 /usr/share/keyrings/intel-oneapi-archive-keyring.gpg && \
|
|
||||||
wget -O- https://repositories.intel.com/graphics/intel-graphics.key | gpg --dearmor | tee /usr/share/keyrings/intel-graphics.gpg > /dev/null && \
|
|
||||||
echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
|
|
||||||
chmod 644 /usr/share/keyrings/intel-graphics.gpg
|
|
||||||
|
|
||||||
RUN apt-get update -y && \
|
|
||||||
apt-get install -y --no-install-recommends --fix-missing \
|
|
||||||
curl \
|
|
||||||
ffmpeg \
|
|
||||||
git \
|
|
||||||
libsndfile1 \
|
|
||||||
libsm6 \
|
|
||||||
libxext6 \
|
|
||||||
libgl1 \
|
|
||||||
lsb-release \
|
|
||||||
numactl \
|
|
||||||
python3 \
|
|
||||||
python3-dev \
|
|
||||||
python3-pip \
|
|
||||||
# vim \
|
|
||||||
wget
|
|
||||||
|
|
||||||
WORKDIR /workspace/vllm
|
|
||||||
COPY requirements-xpu.txt /workspace/vllm/requirements-xpu.txt
|
|
||||||
COPY requirements-common.txt /workspace/vllm/requirements-common.txt
|
|
||||||
|
|
||||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
|
||||||
pip install --no-cache-dir \
|
|
||||||
-r requirements-xpu.txt
|
|
||||||
|
|
||||||
RUN git clone https://github.com/intel/pti-gpu && \
|
|
||||||
cd pti-gpu/sdk && \
|
|
||||||
git checkout 6c491f07a777ed872c2654ca9942f1d0dde0a082 && \
|
|
||||||
mkdir build && \
|
|
||||||
cd build && \
|
|
||||||
cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_TOOLCHAIN_FILE=../cmake/toolchains/icpx_toolchain.cmake -DBUILD_TESTING=OFF .. && \
|
|
||||||
make -j && \
|
|
||||||
cmake --install . --config Release --prefix "/usr/local"
|
|
||||||
|
|
||||||
ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/lib/"
|
|
||||||
|
|
||||||
COPY . .
|
|
||||||
ARG GIT_REPO_CHECK
|
|
||||||
RUN --mount=type=bind,source=.git,target=.git \
|
|
||||||
if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh; fi
|
|
||||||
|
|
||||||
ENV VLLM_TARGET_DEVICE=xpu
|
|
||||||
|
|
||||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
|
||||||
--mount=type=bind,source=.git,target=.git \
|
|
||||||
python3 setup.py install
|
|
||||||
|
|
||||||
CMD ["/bin/bash"]
|
|
||||||
|
|
||||||
FROM vllm-base AS vllm-openai
|
|
||||||
|
|
||||||
# install additional dependencies for openai api server
|
|
||||||
RUN --mount=type=cache,target=/root/.cache/pip \
|
|
||||||
pip install accelerate hf_transfer 'modelscope!=1.15.0'
|
|
||||||
|
|
||||||
ENV VLLM_USAGE_SOURCE production-docker-image \
|
|
||||||
TRITON_XPU_PROFILE 1
|
|
||||||
# install development dependencies (for testing)
|
|
||||||
RUN python3 -m pip install -e tests/vllm_test_utils
|
|
||||||
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
|
|
||||||
10
MANIFEST.in
10
MANIFEST.in
@ -1,9 +1,9 @@
|
|||||||
include LICENSE
|
include LICENSE
|
||||||
include requirements-common.txt
|
include requirements/common.txt
|
||||||
include requirements-cuda.txt
|
include requirements/cuda.txt
|
||||||
include requirements-rocm.txt
|
include requirements/rocm.txt
|
||||||
include requirements-neuron.txt
|
include requirements/neuron.txt
|
||||||
include requirements-cpu.txt
|
include requirements/cpu.txt
|
||||||
include CMakeLists.txt
|
include CMakeLists.txt
|
||||||
|
|
||||||
recursive-include cmake *
|
recursive-include cmake *
|
||||||
|
|||||||
63
README.md
63
README.md
@ -10,13 +10,27 @@ Easy, fast, and cheap LLM serving for everyone
|
|||||||
</h3>
|
</h3>
|
||||||
|
|
||||||
<p align="center">
|
<p align="center">
|
||||||
| <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://vllm.ai"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://discord.gg/jz7wjKhh6g"><b>Discord</b></a> | <a href="https://x.com/vllm_project"><b>Twitter/X</b></a> | <a href="https://slack.vllm.ai"><b>Developer Slack</b></a> |
|
| <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://vllm.ai"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://x.com/vllm_project"><b>Twitter/X</b></a> | <a href="https://discuss.vllm.ai"><b>User Forum</b></a> | <a href="https://slack.vllm.ai"><b>Developer Slack</b></a> |
|
||||||
</p>
|
</p>
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
|
[2025/04] We're hosting our first-ever *vLLM Asia Developer Day* in Singapore on *April 3rd*! This is a full-day event (9 AM - 9 PM SGT) in partnership with SGInnovate, AMD, and Embedded LLM. Meet the vLLM team and learn about LLM inference for RL, MI300X, and more! [Register Now](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
*Latest News* 🔥
|
*Latest News* 🔥
|
||||||
|
- [2025/03] We hosted [vLLM x Ollama Inference Night](https://lu.ma/vllm-ollama)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/16T2PDD1YwRnZ4Tu8Q5r6n53c5Lr5c73UV9Vd2_eBo4U/edit?usp=sharing).
|
||||||
|
- [2025/03] We hosted [the first vLLM China Meetup](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg)! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1REHvfQMKGnvz6p3Fd23HhSO4c8j5WPGZV0bKYLwnHyQ/edit?usp=sharing).
|
||||||
|
- [2025/03] We hosted [the East Coast vLLM Meetup](https://lu.ma/7mu4k4xx)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1NHiv8EUFF1NLd3fEYODm56nDmL26lEeXCaDgyDlTsRs/edit#slide=id.g31441846c39_0_0).
|
||||||
|
- [2025/02] We hosted [the ninth vLLM meetup](https://lu.ma/h7g3kuj9) with Meta! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1jzC_PZVXrVNSFVCW-V4cFXb6pn7zZ2CyP_Flwo05aqg/edit?usp=sharing) and AMD [here](https://drive.google.com/file/d/1Zk5qEJIkTmlQ2eQcXQZlljAx3m9s7nwn/view?usp=sharing). The slides from Meta will not be posted.
|
||||||
|
- [2025/01] We are excited to announce the alpha release of vLLM V1: A major architectural upgrade with 1.7x speedup! Clean code, optimized execution loop, zero-overhead prefix caching, enhanced multimodal support, and more. Please check out our blog post [here](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html).
|
||||||
|
- [2025/01] We hosted [the eighth vLLM meetup](https://lu.ma/zep56hui) with Google Cloud! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1epVkt4Zu8Jz_S5OhEHPc798emsYh2BwYfRuDDVEF7u4/edit?usp=sharing), and Google Cloud team [here](https://drive.google.com/file/d/1h24pHewANyRL11xy5dXUbvRC9F9Kkjix/view?usp=sharing).
|
||||||
- [2024/12] vLLM joins [pytorch ecosystem](https://pytorch.org/blog/vllm-joins-pytorch)! Easy, Fast, and Cheap LLM Serving for Everyone!
|
- [2024/12] vLLM joins [pytorch ecosystem](https://pytorch.org/blog/vllm-joins-pytorch)! Easy, Fast, and Cheap LLM Serving for Everyone!
|
||||||
|
|
||||||
|
<details>
|
||||||
|
<summary>Previous News</summary>
|
||||||
|
|
||||||
- [2024/11] We hosted [the seventh vLLM meetup](https://lu.ma/h0qvrajz) with Snowflake! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing), and Snowflake team [here](https://docs.google.com/presentation/d/1qF3RkDAbOULwz9WK5TOltt2fE9t6uIc_hVNLFAaQX6A/edit?usp=sharing).
|
- [2024/11] We hosted [the seventh vLLM meetup](https://lu.ma/h0qvrajz) with Snowflake! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing), and Snowflake team [here](https://docs.google.com/presentation/d/1qF3RkDAbOULwz9WK5TOltt2fE9t6uIc_hVNLFAaQX6A/edit?usp=sharing).
|
||||||
- [2024/10] We have just created a developer slack ([slack.vllm.ai](https://slack.vllm.ai)) focusing on coordinating contributions and discussing features. Please feel free to join us there!
|
- [2024/10] We have just created a developer slack ([slack.vllm.ai](https://slack.vllm.ai)) focusing on coordinating contributions and discussing features. Please feel free to join us there!
|
||||||
- [2024/10] Ray Summit 2024 held a special track for vLLM! Please find the opening talk slides from the vLLM team [here](https://docs.google.com/presentation/d/1B_KQxpHBTRa_mDF-tR6i8rWdOU5QoTZNcEg2MKZxEHM/edit?usp=sharing). Learn more from the [talks](https://www.youtube.com/playlist?list=PLzTswPQNepXl6AQwifuwUImLPFRVpksjR) from other vLLM contributors and users!
|
- [2024/10] Ray Summit 2024 held a special track for vLLM! Please find the opening talk slides from the vLLM team [here](https://docs.google.com/presentation/d/1B_KQxpHBTRa_mDF-tR6i8rWdOU5QoTZNcEg2MKZxEHM/edit?usp=sharing). Learn more from the [talks](https://www.youtube.com/playlist?list=PLzTswPQNepXl6AQwifuwUImLPFRVpksjR) from other vLLM contributors and users!
|
||||||
@ -30,14 +44,19 @@ Easy, fast, and cheap LLM serving for everyone
|
|||||||
- [2023/08] We would like to express our sincere gratitude to [Andreessen Horowitz](https://a16z.com/2023/08/30/supporting-the-open-source-ai-community/) (a16z) for providing a generous grant to support the open-source development and research of vLLM.
|
- [2023/08] We would like to express our sincere gratitude to [Andreessen Horowitz](https://a16z.com/2023/08/30/supporting-the-open-source-ai-community/) (a16z) for providing a generous grant to support the open-source development and research of vLLM.
|
||||||
- [2023/06] We officially released vLLM! FastChat-vLLM integration has powered [LMSYS Vicuna and Chatbot Arena](https://chat.lmsys.org) since mid-April. Check out our [blog post](https://vllm.ai).
|
- [2023/06] We officially released vLLM! FastChat-vLLM integration has powered [LMSYS Vicuna and Chatbot Arena](https://chat.lmsys.org) since mid-April. Check out our [blog post](https://vllm.ai).
|
||||||
|
|
||||||
|
</details>
|
||||||
|
|
||||||
---
|
---
|
||||||
## About
|
## About
|
||||||
|
|
||||||
vLLM is a fast and easy-to-use library for LLM inference and serving.
|
vLLM is a fast and easy-to-use library for LLM inference and serving.
|
||||||
|
|
||||||
|
Originally developed in the [Sky Computing Lab](https://sky.cs.berkeley.edu) at UC Berkeley, vLLM has evolved into a community-driven project with contributions from both academia and industry.
|
||||||
|
|
||||||
vLLM is fast with:
|
vLLM is fast with:
|
||||||
|
|
||||||
- State-of-the-art serving throughput
|
- State-of-the-art serving throughput
|
||||||
- Efficient management of attention key and value memory with **PagedAttention**
|
- Efficient management of attention key and value memory with [**PagedAttention**](https://blog.vllm.ai/2023/06/20/vllm.html)
|
||||||
- Continuous batching of incoming requests
|
- Continuous batching of incoming requests
|
||||||
- Fast model execution with CUDA/HIP graph
|
- Fast model execution with CUDA/HIP graph
|
||||||
- Quantizations: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), INT4, INT8, and FP8.
|
- Quantizations: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), INT4, INT8, and FP8.
|
||||||
@ -60,7 +79,7 @@ vLLM is flexible and easy to use with:
|
|||||||
|
|
||||||
vLLM seamlessly supports most popular open-source models on HuggingFace, including:
|
vLLM seamlessly supports most popular open-source models on HuggingFace, including:
|
||||||
- Transformer-like LLMs (e.g., Llama)
|
- Transformer-like LLMs (e.g., Llama)
|
||||||
- Mixture-of-Expert LLMs (e.g., Mixtral)
|
- Mixture-of-Expert LLMs (e.g., Mixtral, Deepseek-V2 and V3)
|
||||||
- Embedding Models (e.g. E5-Mistral)
|
- Embedding Models (e.g. E5-Mistral)
|
||||||
- Multi-modal LLMs (e.g., LLaVA)
|
- Multi-modal LLMs (e.g., LLaVA)
|
||||||
|
|
||||||
@ -68,16 +87,16 @@ Find the full list of supported models [here](https://docs.vllm.ai/en/latest/mod
|
|||||||
|
|
||||||
## Getting Started
|
## Getting Started
|
||||||
|
|
||||||
Install vLLM with `pip` or [from source](https://vllm.readthedocs.io/en/latest/getting_started/installation.html#build-from-source):
|
Install vLLM with `pip` or [from source](https://docs.vllm.ai/en/latest/getting_started/installation/gpu/index.html#build-wheel-from-source):
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
pip install vllm
|
pip install vllm
|
||||||
```
|
```
|
||||||
|
|
||||||
Visit our [documentation](https://vllm.readthedocs.io/en/latest/) to learn more.
|
Visit our [documentation](https://docs.vllm.ai/en/latest/) to learn more.
|
||||||
- [Installation](https://vllm.readthedocs.io/en/latest/getting_started/installation.html)
|
- [Installation](https://docs.vllm.ai/en/latest/getting_started/installation.html)
|
||||||
- [Quickstart](https://vllm.readthedocs.io/en/latest/getting_started/quickstart.html)
|
- [Quickstart](https://docs.vllm.ai/en/latest/getting_started/quickstart.html)
|
||||||
- [Supported Models](https://vllm.readthedocs.io/en/latest/models/supported_models.html)
|
- [List of Supported Models](https://docs.vllm.ai/en/latest/models/supported_models.html)
|
||||||
|
|
||||||
## Contributing
|
## Contributing
|
||||||
|
|
||||||
@ -90,34 +109,40 @@ vLLM is a community project. Our compute resources for development and testing a
|
|||||||
|
|
||||||
<!-- Note: Please sort them in alphabetical order. -->
|
<!-- Note: Please sort them in alphabetical order. -->
|
||||||
<!-- Note: Please keep these consistent with docs/source/community/sponsors.md -->
|
<!-- Note: Please keep these consistent with docs/source/community/sponsors.md -->
|
||||||
|
Cash Donations:
|
||||||
- a16z
|
- a16z
|
||||||
|
- Dropbox
|
||||||
|
- Sequoia Capital
|
||||||
|
- Skywork AI
|
||||||
|
- ZhenFund
|
||||||
|
|
||||||
|
Compute Resources:
|
||||||
- AMD
|
- AMD
|
||||||
- Anyscale
|
- Anyscale
|
||||||
- AWS
|
- AWS
|
||||||
- Crusoe Cloud
|
- Crusoe Cloud
|
||||||
- Databricks
|
- Databricks
|
||||||
- DeepInfra
|
- DeepInfra
|
||||||
- Dropbox
|
|
||||||
- Google Cloud
|
- Google Cloud
|
||||||
- Lambda Lab
|
- Lambda Lab
|
||||||
- Nebius
|
- Nebius
|
||||||
|
- Novita AI
|
||||||
- NVIDIA
|
- NVIDIA
|
||||||
- Replicate
|
- Replicate
|
||||||
- Roblox
|
- Roblox
|
||||||
- RunPod
|
- RunPod
|
||||||
- Sequoia Capital
|
|
||||||
- Skywork AI
|
|
||||||
- Trainy
|
- Trainy
|
||||||
- UC Berkeley
|
- UC Berkeley
|
||||||
- UC San Diego
|
- UC San Diego
|
||||||
- ZhenFund
|
|
||||||
|
Slack Sponsor: Anyscale
|
||||||
|
|
||||||
We also have an official fundraising venue through [OpenCollective](https://opencollective.com/vllm). We plan to use the fund to support the development, maintenance, and adoption of vLLM.
|
We also have an official fundraising venue through [OpenCollective](https://opencollective.com/vllm). We plan to use the fund to support the development, maintenance, and adoption of vLLM.
|
||||||
|
|
||||||
## Citation
|
## Citation
|
||||||
|
|
||||||
If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs/2309.06180):
|
If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs/2309.06180):
|
||||||
|
|
||||||
```bibtex
|
```bibtex
|
||||||
@inproceedings{kwon2023efficient,
|
@inproceedings{kwon2023efficient,
|
||||||
title={Efficient Memory Management for Large Language Model Serving with PagedAttention},
|
title={Efficient Memory Management for Large Language Model Serving with PagedAttention},
|
||||||
@ -129,12 +154,12 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs
|
|||||||
|
|
||||||
## Contact Us
|
## Contact Us
|
||||||
|
|
||||||
* For technical questions and feature requests, please use Github issues or discussions.
|
- For technical questions and feature requests, please use GitHub [Issues](https://github.com/vllm-project/vllm/issues) or [Discussions](https://github.com/vllm-project/vllm/discussions)
|
||||||
* For discussing with fellow users, please use Discord.
|
- For discussing with fellow users, please use the [vLLM Forum](https://discuss.vllm.ai)
|
||||||
* For coordinating contributions and development, please use Slack.
|
- coordinating contributions and development, please use [Slack](https://slack.vllm.ai)
|
||||||
* For security disclosures, please use Github's security advisory feature.
|
- For security disclosures, please use GitHub's [Security Advisories](https://github.com/vllm-project/vllm/security/advisories) feature
|
||||||
* For collaborations and partnerships, please contact us at vllm-questions AT lists.berkeley.edu.
|
- For collaborations and partnerships, please contact us at [vllm-questions@lists.berkeley.edu](mailto:vllm-questions@lists.berkeley.edu)
|
||||||
|
|
||||||
## Media Kit
|
## Media Kit
|
||||||
|
|
||||||
* If you wish to use vLLM's logo, please refer to [our media kit repo](https://github.com/vllm-project/media-kit).
|
- If you wish to use vLLM's logo, please refer to [our media kit repo](https://github.com/vllm-project/media-kit).
|
||||||
|
|||||||
54
RELEASE.md
Normal file
54
RELEASE.md
Normal file
@ -0,0 +1,54 @@
|
|||||||
|
# Releasing vLLM
|
||||||
|
|
||||||
|
vLLM releases offer a reliable version of the code base, packaged into a binary format that can be conveniently accessed via PyPI. These releases also serve as key milestones for the development team to communicate with the community about newly available features, improvements, and upcoming changes that could affect users, including potential breaking changes.
|
||||||
|
|
||||||
|
## Release Versioning
|
||||||
|
|
||||||
|
vLLM uses a “right-shifted” versioning scheme where a new patch release is out every 2 weeks. And patch releases contain features and bug fixes (as opposed to semver where patch release contains only backwards-compatible bug fixes). When critical fixes need to be made, special release post1 is released.
|
||||||
|
|
||||||
|
* _major_ major architectural milestone and when incompatible API changes are made, similar to PyTorch 2.0.
|
||||||
|
* _minor_ major features
|
||||||
|
* _patch_ features and backwards-compatible bug fixes
|
||||||
|
* _post1_ or _patch-1_ backwards-compatible bug fixes, either explicit or implicit post release
|
||||||
|
|
||||||
|
## Release Cadence
|
||||||
|
|
||||||
|
Patch release is released on bi-weekly basis. Post release 1-3 days after patch release and uses same branch as patch release.
|
||||||
|
Following is the release cadence for year 2025. All future release dates below are tentative. Please note: Post releases are optional.
|
||||||
|
|
||||||
|
| Release Date | Patch release versions | Post Release versions |
|
||||||
|
| --- | --- | --- |
|
||||||
|
| Jan 2025 | 0.7.0 | --- |
|
||||||
|
| Feb 2025 | 0.7.1, 0.7.2, 0.7.3 | --- |
|
||||||
|
| Mar 2025 | 0.7.4, 0.7.5 | --- |
|
||||||
|
| Apr 2025 | 0.7.6, 0.7.7 | --- |
|
||||||
|
| May 2025 | 0.7.8, 0.7.9 | --- |
|
||||||
|
| Jun 2025 | 0.7.10, 0.7.11 | --- |
|
||||||
|
| Jul 2025 | 0.7.12, 0.7.13 | --- |
|
||||||
|
| Aug 2025 | 0.7.14, 0.7.15 | --- |
|
||||||
|
| Sep 2025 | 0.7.16, 0.7.17 | --- |
|
||||||
|
| Oct 2025 | 0.7.18, 0.7.19 | --- |
|
||||||
|
| Nov 2025 | 0.7.20, 0.7.21 | --- |
|
||||||
|
| Dec 2025 | 0.7.22, 0.7.23 | --- |
|
||||||
|
|
||||||
|
## Release branch
|
||||||
|
|
||||||
|
Each release is built from a dedicated release branch.
|
||||||
|
|
||||||
|
* For _major_, _minor_, _patch_ releases, the release branch cut is performed 1-2 days before release is live.
|
||||||
|
* For post releases, previously cut release branch is reused
|
||||||
|
* Release builds are triggered via push to RC tag like vX.Y.Z-rc1 . This enables us to build and test multiple RCs for each release.
|
||||||
|
* Final tag : vX.Y.Z does not trigger the build but used for Release notes and assets.
|
||||||
|
* After branch cut is created we monitor the main branch for any reverts and apply these reverts to a release branch.
|
||||||
|
|
||||||
|
## Release Cherry-Pick Criteria
|
||||||
|
|
||||||
|
After branch cut, we approach finalizing the release branch with clear criteria on what cherry picks are allowed in. Note: a cherry pick is a process to land a PR in the release branch after branch cut. These are typically limited to ensure that the team has sufficient time to complete a thorough round of testing on a stable code base.
|
||||||
|
|
||||||
|
* Regression fixes - that address functional/performance regression against the most recent release (e.g. 0.7.0 for 0.7.1 release)
|
||||||
|
* Critical fixes - critical fixes for severe issue such as silent incorrectness, backwards compatibility, crashes, deadlocks, (large) memory leaks
|
||||||
|
* Fixes to new features introduced in the most recent release (e.g. 0.7.0 for 0.7.1 release)
|
||||||
|
* Documentation improvements
|
||||||
|
* Release branch specific changes (e.g. change version identifiers or CI fixes)
|
||||||
|
|
||||||
|
Please note: **No feature work allowed for cherry picks**. All PRs that are considered for cherry-picks need to be merged on trunk, the only exception are Release branch specific changes.
|
||||||
@ -4,7 +4,7 @@
|
|||||||
|
|
||||||
If you believe you have found a security vulnerability in vLLM, we encourage you to let us know right away. We will investigate all legitimate reports and do our best to quickly fix the problem.
|
If you believe you have found a security vulnerability in vLLM, we encourage you to let us know right away. We will investigate all legitimate reports and do our best to quickly fix the problem.
|
||||||
|
|
||||||
Please report security issues privately using [the vulnerability submission form](https://github.com/vllm-project/vllm/security/advisories/new).
|
Please report security issues privately using [the vulnerability submission form](https://github.com/vllm-project/vllm/security/advisories/new). Reports will then be triaged by the [vulnerability management team](https://docs.vllm.ai/en/latest/contributing/vulnerability_management.html).
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
|
|||||||
@ -1,19 +1,296 @@
|
|||||||
# Benchmarking vLLM
|
# Benchmarking vLLM
|
||||||
|
|
||||||
## Downloading the ShareGPT dataset
|
This README guides you through running benchmark tests with the extensive
|
||||||
|
datasets supported on vLLM. It’s a living document, updated as new features and datasets
|
||||||
|
become available.
|
||||||
|
|
||||||
|
## Dataset Overview
|
||||||
|
|
||||||
|
<table style="width:100%; border-collapse: collapse;">
|
||||||
|
<thead>
|
||||||
|
<tr>
|
||||||
|
<th style="width:15%; text-align: left;">Dataset</th>
|
||||||
|
<th style="width:10%; text-align: center;">Online</th>
|
||||||
|
<th style="width:10%; text-align: center;">Offline</th>
|
||||||
|
<th style="width:65%; text-align: left;">Data Path</th>
|
||||||
|
</tr>
|
||||||
|
</thead>
|
||||||
|
<tbody>
|
||||||
|
<tr>
|
||||||
|
<td><strong>ShareGPT</strong></td>
|
||||||
|
<td style="text-align: center;">✅</td>
|
||||||
|
<td style="text-align: center;">✅</td>
|
||||||
|
<td><code>wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json</code></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td><strong>BurstGPT</strong></td>
|
||||||
|
<td style="text-align: center;">✅</td>
|
||||||
|
<td style="text-align: center;">✅</td>
|
||||||
|
<td><code>wget https://github.com/HPMLL/BurstGPT/releases/download/v1.1/BurstGPT_without_fails_2.csv</code></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td><strong>Sonnet</strong></td>
|
||||||
|
<td style="text-align: center;">✅</td>
|
||||||
|
<td style="text-align: center;">✅</td>
|
||||||
|
<td>Local file: <code>benchmarks/sonnet.txt</code></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td><strong>Random</strong></td>
|
||||||
|
<td style="text-align: center;">✅</td>
|
||||||
|
<td style="text-align: center;">✅</td>
|
||||||
|
<td><code>synthetic</code></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td><strong>HuggingFace-VisionArena</strong></td>
|
||||||
|
<td style="text-align: center;">✅</td>
|
||||||
|
<td style="text-align: center;">✅</td>
|
||||||
|
<td><code>lmarena-ai/VisionArena-Chat</code></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td><strong>HuggingFace-InstructCoder</strong></td>
|
||||||
|
<td style="text-align: center;">✅</td>
|
||||||
|
<td style="text-align: center;">✅</td>
|
||||||
|
<td><code>likaixin/InstructCoder</code></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td><strong>HuggingFace-Other</strong></td>
|
||||||
|
<td style="text-align: center;">✅</td>
|
||||||
|
<td style="text-align: center;">✅</td>
|
||||||
|
<td><code>lmms-lab/LLaVA-OneVision-Data</code>, <code>Aeala/ShareGPT_Vicuna_unfiltered</code></td>
|
||||||
|
</tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
|
||||||
|
✅: supported
|
||||||
|
|
||||||
|
🟡: Partial support
|
||||||
|
|
||||||
|
🚧: to be supported
|
||||||
|
|
||||||
|
**Note**: HuggingFace dataset's `dataset-name` should be set to `hf`
|
||||||
|
|
||||||
|
---
|
||||||
|
## Example - Online Benchmark
|
||||||
|
|
||||||
|
First start serving your model
|
||||||
|
|
||||||
You can download the dataset by running:
|
|
||||||
```bash
|
```bash
|
||||||
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
|
vllm serve NousResearch/Hermes-3-Llama-3.1-8B --disable-log-requests
|
||||||
```
|
```
|
||||||
|
|
||||||
## Downloading the ShareGPT4V dataset
|
Then run the benchmarking script
|
||||||
|
|
||||||
The json file refers to several image datasets (coco, llava, etc.). The benchmark scripts
|
|
||||||
will ignore a datapoint if the referred image is missing.
|
|
||||||
```bash
|
```bash
|
||||||
wget https://huggingface.co/datasets/Lin-Chen/ShareGPT4V/resolve/main/sharegpt4v_instruct_gpt4-vision_cap100k.json
|
# download dataset
|
||||||
mkdir coco -p
|
# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
|
||||||
wget http://images.cocodataset.org/zips/train2017.zip -O coco/train2017.zip
|
python3 vllm/benchmarks/benchmark_serving.py \
|
||||||
unzip coco/train2017.zip -d coco/
|
--backend vllm \
|
||||||
|
--model NousResearch/Hermes-3-Llama-3.1-8B \
|
||||||
|
--endpoint /v1/completions \
|
||||||
|
--dataset-name sharegpt \
|
||||||
|
--dataset-path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
|
||||||
|
--num-prompts 10
|
||||||
```
|
```
|
||||||
|
|
||||||
|
If successful, you will see the following output
|
||||||
|
|
||||||
|
```
|
||||||
|
============ Serving Benchmark Result ============
|
||||||
|
Successful requests: 10
|
||||||
|
Benchmark duration (s): 5.78
|
||||||
|
Total input tokens: 1369
|
||||||
|
Total generated tokens: 2212
|
||||||
|
Request throughput (req/s): 1.73
|
||||||
|
Output token throughput (tok/s): 382.89
|
||||||
|
Total Token throughput (tok/s): 619.85
|
||||||
|
---------------Time to First Token----------------
|
||||||
|
Mean TTFT (ms): 71.54
|
||||||
|
Median TTFT (ms): 73.88
|
||||||
|
P99 TTFT (ms): 79.49
|
||||||
|
-----Time per Output Token (excl. 1st token)------
|
||||||
|
Mean TPOT (ms): 7.91
|
||||||
|
Median TPOT (ms): 7.96
|
||||||
|
P99 TPOT (ms): 8.03
|
||||||
|
---------------Inter-token Latency----------------
|
||||||
|
Mean ITL (ms): 7.74
|
||||||
|
Median ITL (ms): 7.70
|
||||||
|
P99 ITL (ms): 8.39
|
||||||
|
==================================================
|
||||||
|
```
|
||||||
|
|
||||||
|
### VisionArena Benchmark for Vision Language Models
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# need a model with vision capability here
|
||||||
|
vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests
|
||||||
|
```
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python3 vllm/benchmarks/benchmark_serving.py \
|
||||||
|
--backend openai-chat \
|
||||||
|
--model Qwen/Qwen2-VL-7B-Instruct \
|
||||||
|
--endpoint /v1/chat/completions \
|
||||||
|
--dataset-name hf \
|
||||||
|
--dataset-path lmarena-ai/VisionArena-Chat \
|
||||||
|
--hf-split train \
|
||||||
|
--num-prompts 1000
|
||||||
|
```
|
||||||
|
|
||||||
|
### InstructCoder Benchmark with Speculative Decoding
|
||||||
|
|
||||||
|
``` bash
|
||||||
|
VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \
|
||||||
|
--speculative-model "[ngram]" \
|
||||||
|
--ngram_prompt_lookup_min 2 \
|
||||||
|
--ngram-prompt-lookup-max 5 \
|
||||||
|
--num_speculative_tokens 5
|
||||||
|
```
|
||||||
|
|
||||||
|
``` bash
|
||||||
|
python3 benchmarks/benchmark_serving.py \
|
||||||
|
--model meta-llama/Meta-Llama-3-8B-Instruct \
|
||||||
|
--dataset-name hf \
|
||||||
|
--dataset-path likaixin/InstructCoder \
|
||||||
|
--num-prompts 2048
|
||||||
|
```
|
||||||
|
|
||||||
|
### Other HuggingFaceDataset Examples
|
||||||
|
|
||||||
|
```bash
|
||||||
|
vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests
|
||||||
|
```
|
||||||
|
|
||||||
|
**`lmms-lab/LLaVA-OneVision-Data`**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python3 vllm/benchmarks/benchmark_serving.py \
|
||||||
|
--backend openai-chat \
|
||||||
|
--model Qwen/Qwen2-VL-7B-Instruct \
|
||||||
|
--endpoint /v1/chat/completions \
|
||||||
|
--dataset-name hf \
|
||||||
|
--dataset-path lmms-lab/LLaVA-OneVision-Data \
|
||||||
|
--hf-split train \
|
||||||
|
--hf-subset "chart2text(cauldron)" \
|
||||||
|
--num-prompts 10
|
||||||
|
```
|
||||||
|
|
||||||
|
**`Aeala/ShareGPT_Vicuna_unfiltered`**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python3 vllm/benchmarks/benchmark_serving.py \
|
||||||
|
--backend openai-chat \
|
||||||
|
--model Qwen/Qwen2-VL-7B-Instruct \
|
||||||
|
--endpoint /v1/chat/completions \
|
||||||
|
--dataset-name hf \
|
||||||
|
--dataset-path Aeala/ShareGPT_Vicuna_unfiltered \
|
||||||
|
--hf-split train \
|
||||||
|
--num-prompts 10
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
## Example - Offline Throughput Benchmark
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python3 vllm/benchmarks/benchmark_throughput.py \
|
||||||
|
--model NousResearch/Hermes-3-Llama-3.1-8B \
|
||||||
|
--dataset-name sonnet \
|
||||||
|
--dataset-path vllm/benchmarks/sonnet.txt \
|
||||||
|
--num-prompts 10
|
||||||
|
```
|
||||||
|
|
||||||
|
If successful, you will see the following output
|
||||||
|
|
||||||
|
```
|
||||||
|
Throughput: 7.15 requests/s, 4656.00 total tokens/s, 1072.15 output tokens/s
|
||||||
|
Total num prompt tokens: 5014
|
||||||
|
Total num output tokens: 1500
|
||||||
|
```
|
||||||
|
|
||||||
|
### VisionArena Benchmark for Vision Language Models
|
||||||
|
|
||||||
|
``` bash
|
||||||
|
python3 vllm/benchmarks/benchmark_throughput.py \
|
||||||
|
--model Qwen/Qwen2-VL-7B-Instruct \
|
||||||
|
--backend vllm-chat \
|
||||||
|
--dataset-name hf \
|
||||||
|
--dataset-path lmarena-ai/VisionArena-Chat \
|
||||||
|
--num-prompts 1000 \
|
||||||
|
--hf-split train
|
||||||
|
```
|
||||||
|
|
||||||
|
The `num prompt tokens` now includes image token counts
|
||||||
|
|
||||||
|
```
|
||||||
|
Throughput: 2.55 requests/s, 4036.92 total tokens/s, 326.90 output tokens/s
|
||||||
|
Total num prompt tokens: 14527
|
||||||
|
Total num output tokens: 1280
|
||||||
|
```
|
||||||
|
|
||||||
|
### InstructCoder Benchmark with Speculative Decoding
|
||||||
|
|
||||||
|
``` bash
|
||||||
|
VLLM_WORKER_MULTIPROC_METHOD=spawn \
|
||||||
|
VLLM_USE_V1=1 \
|
||||||
|
python3 vllm/benchmarks/benchmark_throughput.py \
|
||||||
|
--dataset-name=hf \
|
||||||
|
--dataset-path=likaixin/InstructCoder \
|
||||||
|
--model=meta-llama/Meta-Llama-3-8B-Instruct \
|
||||||
|
--input-len=1000 \
|
||||||
|
--output-len=100 \
|
||||||
|
--num-prompts=2048 \
|
||||||
|
--async-engine \
|
||||||
|
--speculative-model="[ngram]" \
|
||||||
|
--ngram_prompt_lookup_min=2 \
|
||||||
|
--ngram-prompt-lookup-max=5 \
|
||||||
|
--num_speculative_tokens=5
|
||||||
|
```
|
||||||
|
|
||||||
|
```
|
||||||
|
Throughput: 104.77 requests/s, 23836.22 total tokens/s, 10477.10 output tokens/s
|
||||||
|
Total num prompt tokens: 261136
|
||||||
|
Total num output tokens: 204800
|
||||||
|
```
|
||||||
|
|
||||||
|
### Other HuggingFaceDataset Examples
|
||||||
|
|
||||||
|
**`lmms-lab/LLaVA-OneVision-Data`**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python3 vllm/benchmarks/benchmark_throughput.py \
|
||||||
|
--model Qwen/Qwen2-VL-7B-Instruct \
|
||||||
|
--backend vllm-chat \
|
||||||
|
--dataset-name hf \
|
||||||
|
--dataset-path lmms-lab/LLaVA-OneVision-Data \
|
||||||
|
--hf-split train \
|
||||||
|
--hf-subset "chart2text(cauldron)" \
|
||||||
|
--num-prompts 10
|
||||||
|
```
|
||||||
|
|
||||||
|
**`Aeala/ShareGPT_Vicuna_unfiltered`**
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python3 vllm/benchmarks/benchmark_throughput.py \
|
||||||
|
--model Qwen/Qwen2-VL-7B-Instruct \
|
||||||
|
--backend vllm-chat \
|
||||||
|
--dataset-name hf \
|
||||||
|
--dataset-path Aeala/ShareGPT_Vicuna_unfiltered \
|
||||||
|
--hf-split train \
|
||||||
|
--num-prompts 10
|
||||||
|
```
|
||||||
|
|
||||||
|
### Benchmark with LoRA Adapters
|
||||||
|
|
||||||
|
``` bash
|
||||||
|
# download dataset
|
||||||
|
# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
|
||||||
|
python3 vllm/benchmarks/benchmark_throughput.py \
|
||||||
|
--model meta-llama/Llama-2-7b-hf \
|
||||||
|
--backend vllm \
|
||||||
|
--dataset_path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
|
||||||
|
--dataset_name sharegpt \
|
||||||
|
--num-prompts 10 \
|
||||||
|
--max-loras 2 \
|
||||||
|
--max-lora-rank 8 \
|
||||||
|
--enable-lora \
|
||||||
|
--lora-path yard1/llama-2-7b-sql-lora-test
|
||||||
|
```
|
||||||
|
|||||||
@ -1,10 +1,12 @@
|
|||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
import time
|
import time
|
||||||
import traceback
|
import traceback
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
from typing import List, Optional, Union
|
from typing import Optional, Union
|
||||||
|
|
||||||
import aiohttp
|
import aiohttp
|
||||||
import huggingface_hub.constants
|
import huggingface_hub.constants
|
||||||
@ -12,6 +14,9 @@ from tqdm.asyncio import tqdm
|
|||||||
from transformers import (AutoTokenizer, PreTrainedTokenizer,
|
from transformers import (AutoTokenizer, PreTrainedTokenizer,
|
||||||
PreTrainedTokenizerFast)
|
PreTrainedTokenizerFast)
|
||||||
|
|
||||||
|
# NOTE(simon): do not import vLLM here so the benchmark script
|
||||||
|
# can run without vLLM installed.
|
||||||
|
|
||||||
AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)
|
AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)
|
||||||
|
|
||||||
|
|
||||||
@ -22,7 +27,7 @@ class RequestFuncInput:
|
|||||||
prompt_len: int
|
prompt_len: int
|
||||||
output_len: int
|
output_len: int
|
||||||
model: str
|
model: str
|
||||||
best_of: int = 1
|
model_name: Optional[str] = None
|
||||||
logprobs: Optional[int] = None
|
logprobs: Optional[int] = None
|
||||||
extra_body: Optional[dict] = None
|
extra_body: Optional[dict] = None
|
||||||
multi_modal_content: Optional[dict] = None
|
multi_modal_content: Optional[dict] = None
|
||||||
@ -34,9 +39,10 @@ class RequestFuncOutput:
|
|||||||
generated_text: str = ""
|
generated_text: str = ""
|
||||||
success: bool = False
|
success: bool = False
|
||||||
latency: float = 0.0
|
latency: float = 0.0
|
||||||
|
output_tokens: int = 0
|
||||||
ttft: float = 0.0 # Time to first token
|
ttft: float = 0.0 # Time to first token
|
||||||
itl: List[float] = field(
|
itl: list[float] = field(
|
||||||
default_factory=list) # List of inter-token latencies
|
default_factory=list) # list of inter-token latencies
|
||||||
tpot: float = 0.0 # avg next-token latencies
|
tpot: float = 0.0 # avg next-token latencies
|
||||||
prompt_len: int = 0
|
prompt_len: int = 0
|
||||||
error: str = ""
|
error: str = ""
|
||||||
@ -49,15 +55,15 @@ async def async_request_tgi(
|
|||||||
api_url = request_func_input.api_url
|
api_url = request_func_input.api_url
|
||||||
assert api_url.endswith("generate_stream")
|
assert api_url.endswith("generate_stream")
|
||||||
|
|
||||||
async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
|
async with aiohttp.ClientSession(trust_env=True,
|
||||||
|
timeout=AIOHTTP_TIMEOUT) as session:
|
||||||
params = {
|
params = {
|
||||||
"best_of": request_func_input.best_of,
|
|
||||||
"max_new_tokens": request_func_input.output_len,
|
"max_new_tokens": request_func_input.output_len,
|
||||||
"do_sample": True,
|
"do_sample": True,
|
||||||
"temperature": 0.01, # TGI does not accept 0.0 temperature.
|
"temperature": 0.01, # TGI does not accept 0.0 temperature.
|
||||||
"top_p": 0.99, # TGI does not accept 1.0 top_p.
|
"top_p": 0.99, # TGI does not accept 1.0 top_p.
|
||||||
"truncate": request_func_input.prompt_len,
|
"truncate": request_func_input.prompt_len,
|
||||||
# TGI does not accept ignore_eos flag.
|
"ignore_eos_token": request_func_input.ignore_eos,
|
||||||
}
|
}
|
||||||
payload = {
|
payload = {
|
||||||
"inputs": request_func_input.prompt,
|
"inputs": request_func_input.prompt,
|
||||||
@ -65,6 +71,10 @@ async def async_request_tgi(
|
|||||||
}
|
}
|
||||||
output = RequestFuncOutput()
|
output = RequestFuncOutput()
|
||||||
output.prompt_len = request_func_input.prompt_len
|
output.prompt_len = request_func_input.prompt_len
|
||||||
|
if request_func_input.ignore_eos:
|
||||||
|
output.output_tokens = request_func_input.output_len
|
||||||
|
else:
|
||||||
|
output.output_tokens = None
|
||||||
|
|
||||||
ttft = 0.0
|
ttft = 0.0
|
||||||
st = time.perf_counter()
|
st = time.perf_counter()
|
||||||
@ -78,7 +88,7 @@ async def async_request_tgi(
|
|||||||
continue
|
continue
|
||||||
chunk_bytes = chunk_bytes.decode("utf-8")
|
chunk_bytes = chunk_bytes.decode("utf-8")
|
||||||
|
|
||||||
#NOTE: Sometimes TGI returns a ping response without
|
# NOTE: Sometimes TGI returns a ping response without
|
||||||
# any data, we should skip it.
|
# any data, we should skip it.
|
||||||
if chunk_bytes.startswith(":"):
|
if chunk_bytes.startswith(":"):
|
||||||
continue
|
continue
|
||||||
@ -121,8 +131,8 @@ async def async_request_trt_llm(
|
|||||||
api_url = request_func_input.api_url
|
api_url = request_func_input.api_url
|
||||||
assert api_url.endswith("generate_stream")
|
assert api_url.endswith("generate_stream")
|
||||||
|
|
||||||
async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
|
async with aiohttp.ClientSession(trust_env=True,
|
||||||
assert request_func_input.best_of == 1
|
timeout=AIOHTTP_TIMEOUT) as session:
|
||||||
payload = {
|
payload = {
|
||||||
"accumulate_tokens": True,
|
"accumulate_tokens": True,
|
||||||
"text_input": request_func_input.prompt,
|
"text_input": request_func_input.prompt,
|
||||||
@ -155,7 +165,7 @@ async def async_request_trt_llm(
|
|||||||
timestamp = time.perf_counter()
|
timestamp = time.perf_counter()
|
||||||
# First token
|
# First token
|
||||||
if ttft == 0.0:
|
if ttft == 0.0:
|
||||||
ttft = time.perf_counter() - st
|
ttft = timestamp - st
|
||||||
output.ttft = ttft
|
output.ttft = ttft
|
||||||
|
|
||||||
# Decoding phase
|
# Decoding phase
|
||||||
@ -185,8 +195,8 @@ async def async_request_deepspeed_mii(
|
|||||||
request_func_input: RequestFuncInput,
|
request_func_input: RequestFuncInput,
|
||||||
pbar: Optional[tqdm] = None,
|
pbar: Optional[tqdm] = None,
|
||||||
) -> RequestFuncOutput:
|
) -> RequestFuncOutput:
|
||||||
async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
|
async with aiohttp.ClientSession(trust_env=True,
|
||||||
assert request_func_input.best_of == 1
|
timeout=AIOHTTP_TIMEOUT) as session:
|
||||||
|
|
||||||
payload = {
|
payload = {
|
||||||
"prompt": request_func_input.prompt,
|
"prompt": request_func_input.prompt,
|
||||||
@ -233,17 +243,22 @@ async def async_request_openai_completions(
|
|||||||
("completions", "profile")
|
("completions", "profile")
|
||||||
), "OpenAI Completions API URL must end with 'completions' or 'profile'."
|
), "OpenAI Completions API URL must end with 'completions' or 'profile'."
|
||||||
|
|
||||||
async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
|
async with aiohttp.ClientSession(trust_env=True,
|
||||||
|
timeout=AIOHTTP_TIMEOUT) as session:
|
||||||
payload = {
|
payload = {
|
||||||
"model": request_func_input.model,
|
"model": request_func_input.model_name \
|
||||||
|
if request_func_input.model_name else request_func_input.model,
|
||||||
"prompt": request_func_input.prompt,
|
"prompt": request_func_input.prompt,
|
||||||
"temperature": 0.0,
|
"temperature": 0.0,
|
||||||
"best_of": request_func_input.best_of,
|
|
||||||
"max_tokens": request_func_input.output_len,
|
"max_tokens": request_func_input.output_len,
|
||||||
"logprobs": request_func_input.logprobs,
|
"logprobs": request_func_input.logprobs,
|
||||||
"stream": True,
|
"stream": True,
|
||||||
"ignore_eos": request_func_input.ignore_eos,
|
"stream_options": {
|
||||||
|
"include_usage": True,
|
||||||
|
},
|
||||||
}
|
}
|
||||||
|
if request_func_input.ignore_eos:
|
||||||
|
payload["ignore_eos"] = request_func_input.ignore_eos
|
||||||
if request_func_input.extra_body:
|
if request_func_input.extra_body:
|
||||||
payload.update(request_func_input.extra_body)
|
payload.update(request_func_input.extra_body)
|
||||||
headers = {
|
headers = {
|
||||||
@ -254,7 +269,6 @@ async def async_request_openai_completions(
|
|||||||
output.prompt_len = request_func_input.prompt_len
|
output.prompt_len = request_func_input.prompt_len
|
||||||
|
|
||||||
generated_text = ""
|
generated_text = ""
|
||||||
ttft = 0.0
|
|
||||||
st = time.perf_counter()
|
st = time.perf_counter()
|
||||||
most_recent_timestamp = st
|
most_recent_timestamp = st
|
||||||
try:
|
try:
|
||||||
@ -269,15 +283,16 @@ async def async_request_openai_completions(
|
|||||||
|
|
||||||
chunk = chunk_bytes.decode("utf-8").removeprefix(
|
chunk = chunk_bytes.decode("utf-8").removeprefix(
|
||||||
"data: ")
|
"data: ")
|
||||||
if chunk == "[DONE]":
|
if chunk != "[DONE]":
|
||||||
latency = time.perf_counter() - st
|
|
||||||
else:
|
|
||||||
data = json.loads(chunk)
|
data = json.loads(chunk)
|
||||||
|
|
||||||
# NOTE: Some completion API might have a last
|
# NOTE: Some completion API might have a last
|
||||||
# usage summary response without a token so we
|
# usage summary response without a token so we
|
||||||
# want to check a token was generated
|
# want to check a token was generated
|
||||||
if data["choices"][0]["text"]:
|
if choices := data.get("choices"):
|
||||||
|
# Note that text could be empty here
|
||||||
|
# e.g. for special tokens
|
||||||
|
text = choices[0].get("text")
|
||||||
timestamp = time.perf_counter()
|
timestamp = time.perf_counter()
|
||||||
# First token
|
# First token
|
||||||
if not first_chunk_received:
|
if not first_chunk_received:
|
||||||
@ -291,7 +306,10 @@ async def async_request_openai_completions(
|
|||||||
most_recent_timestamp)
|
most_recent_timestamp)
|
||||||
|
|
||||||
most_recent_timestamp = timestamp
|
most_recent_timestamp = timestamp
|
||||||
generated_text += data["choices"][0]["text"]
|
generated_text += text or ""
|
||||||
|
elif usage := data.get("usage"):
|
||||||
|
output.output_tokens = usage.get(
|
||||||
|
"completion_tokens")
|
||||||
if first_chunk_received:
|
if first_chunk_received:
|
||||||
output.success = True
|
output.success = True
|
||||||
else:
|
else:
|
||||||
@ -300,7 +318,7 @@ async def async_request_openai_completions(
|
|||||||
"Never received a valid chunk to calculate TTFT."
|
"Never received a valid chunk to calculate TTFT."
|
||||||
"This response will be marked as failed!")
|
"This response will be marked as failed!")
|
||||||
output.generated_text = generated_text
|
output.generated_text = generated_text
|
||||||
output.latency = latency
|
output.latency = most_recent_timestamp - st
|
||||||
else:
|
else:
|
||||||
output.error = response.reason or ""
|
output.error = response.reason or ""
|
||||||
output.success = False
|
output.success = False
|
||||||
@ -320,15 +338,17 @@ async def async_request_openai_chat_completions(
|
|||||||
) -> RequestFuncOutput:
|
) -> RequestFuncOutput:
|
||||||
api_url = request_func_input.api_url
|
api_url = request_func_input.api_url
|
||||||
assert api_url.endswith(
|
assert api_url.endswith(
|
||||||
"chat/completions"
|
("chat/completions", "profile")
|
||||||
), "OpenAI Chat Completions API URL must end with 'chat/completions'."
|
), "OpenAI Chat Completions API URL must end with 'chat/completions'."
|
||||||
|
|
||||||
async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
|
async with aiohttp.ClientSession(trust_env=True,
|
||||||
|
timeout=AIOHTTP_TIMEOUT) as session:
|
||||||
content = [{"type": "text", "text": request_func_input.prompt}]
|
content = [{"type": "text", "text": request_func_input.prompt}]
|
||||||
if request_func_input.multi_modal_content:
|
if request_func_input.multi_modal_content:
|
||||||
content.append(request_func_input.multi_modal_content)
|
content.append(request_func_input.multi_modal_content)
|
||||||
payload = {
|
payload = {
|
||||||
"model": request_func_input.model,
|
"model": request_func_input.model_name \
|
||||||
|
if request_func_input.model_name else request_func_input.model,
|
||||||
"messages": [
|
"messages": [
|
||||||
{
|
{
|
||||||
"role": "user",
|
"role": "user",
|
||||||
@ -338,8 +358,12 @@ async def async_request_openai_chat_completions(
|
|||||||
"temperature": 0.0,
|
"temperature": 0.0,
|
||||||
"max_completion_tokens": request_func_input.output_len,
|
"max_completion_tokens": request_func_input.output_len,
|
||||||
"stream": True,
|
"stream": True,
|
||||||
"ignore_eos": request_func_input.ignore_eos,
|
"stream_options": {
|
||||||
|
"include_usage": True,
|
||||||
|
},
|
||||||
}
|
}
|
||||||
|
if request_func_input.ignore_eos:
|
||||||
|
payload["ignore_eos"] = request_func_input.ignore_eos
|
||||||
if request_func_input.extra_body:
|
if request_func_input.extra_body:
|
||||||
payload.update(request_func_input.extra_body)
|
payload.update(request_func_input.extra_body)
|
||||||
headers = {
|
headers = {
|
||||||
@ -365,17 +389,15 @@ async def async_request_openai_chat_completions(
|
|||||||
|
|
||||||
chunk = chunk_bytes.decode("utf-8").removeprefix(
|
chunk = chunk_bytes.decode("utf-8").removeprefix(
|
||||||
"data: ")
|
"data: ")
|
||||||
if chunk == "[DONE]":
|
if chunk != "[DONE]":
|
||||||
latency = time.perf_counter() - st
|
|
||||||
else:
|
|
||||||
timestamp = time.perf_counter()
|
timestamp = time.perf_counter()
|
||||||
data = json.loads(chunk)
|
data = json.loads(chunk)
|
||||||
|
|
||||||
delta = data["choices"][0]["delta"]
|
if choices := data.get("choices"):
|
||||||
if delta.get("content", None):
|
content = choices[0]["delta"].get("content")
|
||||||
# First token
|
# First token
|
||||||
if ttft == 0.0:
|
if ttft == 0.0:
|
||||||
ttft = time.perf_counter() - st
|
ttft = timestamp - st
|
||||||
output.ttft = ttft
|
output.ttft = ttft
|
||||||
|
|
||||||
# Decoding phase
|
# Decoding phase
|
||||||
@ -383,13 +405,16 @@ async def async_request_openai_chat_completions(
|
|||||||
output.itl.append(timestamp -
|
output.itl.append(timestamp -
|
||||||
most_recent_timestamp)
|
most_recent_timestamp)
|
||||||
|
|
||||||
generated_text += delta["content"]
|
generated_text += content or ""
|
||||||
|
elif usage := data.get("usage"):
|
||||||
|
output.output_tokens = usage.get(
|
||||||
|
"completion_tokens")
|
||||||
|
|
||||||
most_recent_timestamp = timestamp
|
most_recent_timestamp = timestamp
|
||||||
|
|
||||||
output.generated_text = generated_text
|
output.generated_text = generated_text
|
||||||
output.success = True
|
output.success = True
|
||||||
output.latency = latency
|
output.latency = most_recent_timestamp - st
|
||||||
else:
|
else:
|
||||||
output.error = response.reason or ""
|
output.error = response.reason or ""
|
||||||
output.success = False
|
output.success = False
|
||||||
@ -407,24 +432,50 @@ def get_model(pretrained_model_name_or_path: str) -> str:
|
|||||||
if os.getenv('VLLM_USE_MODELSCOPE', 'False').lower() == 'true':
|
if os.getenv('VLLM_USE_MODELSCOPE', 'False').lower() == 'true':
|
||||||
from modelscope import snapshot_download
|
from modelscope import snapshot_download
|
||||||
|
|
||||||
model_path = snapshot_download(
|
from vllm.model_executor.model_loader.weight_utils import get_lock
|
||||||
model_id=pretrained_model_name_or_path,
|
|
||||||
local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
|
|
||||||
ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"])
|
|
||||||
|
|
||||||
return model_path
|
# Use file lock to prevent multiple processes from
|
||||||
|
# downloading the same model weights at the same time.
|
||||||
|
with get_lock(pretrained_model_name_or_path):
|
||||||
|
model_path = snapshot_download(
|
||||||
|
model_id=pretrained_model_name_or_path,
|
||||||
|
local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
|
||||||
|
ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"])
|
||||||
|
|
||||||
|
return model_path
|
||||||
return pretrained_model_name_or_path
|
return pretrained_model_name_or_path
|
||||||
|
|
||||||
|
|
||||||
def get_tokenizer(
|
def get_tokenizer(
|
||||||
pretrained_model_name_or_path: str, trust_remote_code: bool
|
pretrained_model_name_or_path: str,
|
||||||
|
tokenizer_mode: str = "auto",
|
||||||
|
trust_remote_code: bool = False,
|
||||||
|
**kwargs,
|
||||||
) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
|
) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
|
||||||
if pretrained_model_name_or_path is not None and not os.path.exists(
|
if pretrained_model_name_or_path is not None and not os.path.exists(
|
||||||
pretrained_model_name_or_path):
|
pretrained_model_name_or_path):
|
||||||
pretrained_model_name_or_path = get_model(
|
pretrained_model_name_or_path = get_model(
|
||||||
pretrained_model_name_or_path)
|
pretrained_model_name_or_path)
|
||||||
return AutoTokenizer.from_pretrained(pretrained_model_name_or_path,
|
if tokenizer_mode == "slow":
|
||||||
trust_remote_code=trust_remote_code)
|
if kwargs.get("use_fast", False):
|
||||||
|
raise ValueError(
|
||||||
|
"Cannot use the fast tokenizer in slow tokenizer mode.")
|
||||||
|
kwargs["use_fast"] = False
|
||||||
|
if tokenizer_mode == "mistral":
|
||||||
|
try:
|
||||||
|
from vllm.transformers_utils.tokenizer import MistralTokenizer
|
||||||
|
except ImportError as e:
|
||||||
|
raise ImportError("MistralTokenizer requires vllm package.\n"
|
||||||
|
"Please install it with `pip install vllm` "
|
||||||
|
"to use mistral tokenizer mode.") from e
|
||||||
|
return MistralTokenizer.from_pretrained(
|
||||||
|
str(pretrained_model_name_or_path))
|
||||||
|
else:
|
||||||
|
return AutoTokenizer.from_pretrained(
|
||||||
|
pretrained_model_name_or_path,
|
||||||
|
trust_remote_code=trust_remote_code,
|
||||||
|
**kwargs,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
ASYNC_REQUEST_FUNCS = {
|
ASYNC_REQUEST_FUNCS = {
|
||||||
|
|||||||
763
benchmarks/benchmark_dataset.py
Normal file
763
benchmarks/benchmark_dataset.py
Normal file
@ -0,0 +1,763 @@
|
|||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
"""
|
||||||
|
This module defines a framework for sampling benchmark requests from various
|
||||||
|
datasets. Each dataset subclass of BenchmarkDataset must implement sample
|
||||||
|
generation. Supported dataset types include:
|
||||||
|
- ShareGPT
|
||||||
|
- Random (synthetic)
|
||||||
|
- Sonnet
|
||||||
|
- BurstGPT
|
||||||
|
- HuggingFace
|
||||||
|
- VisionArena
|
||||||
|
|
||||||
|
TODO: Implement CustomDataset to parse a JSON file and convert its contents into
|
||||||
|
SampleRequest instances, similar to the approach used in ShareGPT.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import base64
|
||||||
|
import io
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import random
|
||||||
|
from abc import ABC, abstractmethod
|
||||||
|
from collections.abc import Mapping
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from functools import cache
|
||||||
|
from io import BytesIO
|
||||||
|
from typing import Any, Callable, Optional, Union
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
from datasets import load_dataset
|
||||||
|
from PIL import Image
|
||||||
|
from transformers import PreTrainedTokenizerBase
|
||||||
|
|
||||||
|
from vllm.lora.request import LoRARequest
|
||||||
|
from vllm.lora.utils import get_adapter_absolute_path
|
||||||
|
from vllm.multimodal import MultiModalDataDict
|
||||||
|
from vllm.transformers_utils.tokenizer import AnyTokenizer, get_lora_tokenizer
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# Data Classes
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class SampleRequest:
|
||||||
|
"""
|
||||||
|
Represents a single inference request for benchmarking.
|
||||||
|
"""
|
||||||
|
|
||||||
|
prompt: Union[str, Any]
|
||||||
|
prompt_len: int
|
||||||
|
expected_output_len: int
|
||||||
|
multi_modal_data: Optional[Union[MultiModalDataDict, dict]] = None
|
||||||
|
lora_request: Optional[LoRARequest] = None
|
||||||
|
|
||||||
|
|
||||||
|
# -----------------------------------------------------------------------------
|
||||||
|
# Benchmark Dataset Base Class
# -----------------------------------------------------------------------------


class BenchmarkDataset(ABC):
    DEFAULT_SEED = 0

    def __init__(
        self,
        dataset_path: Optional[str] = None,
        random_seed: int = DEFAULT_SEED,
    ) -> None:
        """
        Initialize the BenchmarkDataset with an optional dataset path and
        random seed.

        Args:
            dataset_path (Optional[str]): Path to the dataset. If None, it
                indicates that a default or random dataset might be used.
            random_seed (int): Seed value for reproducible shuffling or
                sampling. Defaults to DEFAULT_SEED.
        """
        self.dataset_path = dataset_path
        # Set the random seed, ensuring that a None value is replaced with the
        # default seed.
        self.random_seed = (random_seed
                            if random_seed is not None else self.DEFAULT_SEED)
        self.data = None

    def apply_multimodal_chat_transformation(
            self,
            prompt: str,
            mm_content: Optional[MultiModalDataDict] = None) -> list[dict]:
        """
        Transform a prompt and optional multimodal content into a chat format.
        This method is used for chat models that expect a specific
        conversation format.
        """
        content = [{"text": prompt, "type": "text"}]
        if mm_content is not None:
            content.append(mm_content)
        return [{"role": "user", "content": content}]

    def load_data(self) -> None:
        """
        Load data from the dataset path into self.data.

        This method must be overridden by subclasses since the method to load
        data will vary depending on the dataset format and source.

        Raises:
            NotImplementedError: If a subclass does not implement this method.
        """
        # TODO (jenniferzhao): add support for downloading data
        raise NotImplementedError(
            "load_data must be implemented in subclasses.")

    def get_random_lora_request(
        self,
        tokenizer: PreTrainedTokenizerBase,
        max_loras: Optional[int] = None,
        lora_path: Optional[str] = None,
    ) -> tuple[Optional[LoRARequest], AnyTokenizer]:
        """
        Optionally select a random LoRA request and return its associated
        tokenizer.

        This method is used when LoRA parameters are provided. It randomly
        selects a LoRA based on max_loras and retrieves a cached tokenizer for
        that LoRA if available. Otherwise, it returns the base tokenizer.

        Args:
            tokenizer (PreTrainedTokenizerBase): The base tokenizer to use if
                no LoRA is selected.
            max_loras (Optional[int]): The maximum number of LoRAs available.
                If None, LoRA is not used.
            lora_path (Optional[str]): Path to the LoRA parameters on disk.
                If None, LoRA is not used.

        Returns:
            tuple[Optional[LoRARequest], AnyTokenizer]: A tuple where the first
            element is a LoRARequest (or None if not applicable) and the second
            element is the tokenizer associated with the LoRA request (or the
            base tokenizer).
        """
        if max_loras is None or lora_path is None:
            return None, tokenizer

        # Generate a random LoRA ID in the range [1, max_loras].
        lora_id = random.randint(1, max_loras)
        lora_request = LoRARequest(
            lora_name=str(lora_id),
            lora_int_id=lora_id,
            lora_path=lora_path_on_disk(lora_path),
        )
        if lora_id not in lora_tokenizer_cache:
            lora_tokenizer_cache[lora_id] = get_lora_tokenizer(lora_request)
        # Return lora_request and the cached tokenizer if available; otherwise,
        # return the base tokenizer
        return lora_request, lora_tokenizer_cache[lora_id] or tokenizer

    @abstractmethod
    def sample(self, tokenizer: PreTrainedTokenizerBase,
               num_requests: int) -> list[SampleRequest]:
        """
        Abstract method to generate sample requests from the dataset.

        Subclasses must override this method to implement dataset-specific
        logic for generating a list of SampleRequest objects.

        Args:
            tokenizer (PreTrainedTokenizerBase): The tokenizer to be used
                for processing the dataset's text.
            num_requests (int): The number of sample requests to generate.

        Returns:
            list[SampleRequest]: A list of sample requests generated from the
            dataset.
        """
        raise NotImplementedError("sample must be implemented in subclasses.")

    def maybe_oversample_requests(self, requests: list[SampleRequest],
                                  num_requests: int) -> None:
        """
        Oversamples the list of requests if its size is less than the desired
        number.

        Args:
            requests (list[SampleRequest]): The current list of sampled
                requests.
            num_requests (int): The target number of requests.
        """
        if len(requests) < num_requests:
            random.seed(self.random_seed)
            additional = random.choices(requests,
                                        k=num_requests - len(requests))
            requests.extend(additional)
            logger.info("Oversampled requests to reach %d total samples.",
                        num_requests)
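# Illustrative only, not part of the diff: a minimal sketch of what a concrete
# BenchmarkDataset subclass has to provide (load_data plus sample). The class
# name and the fixed output length are hypothetical.
class _InMemoryDataset(BenchmarkDataset):
    """Toy subclass that serves a fixed, in-memory list of prompts."""

    def __init__(self, prompts: list[str], **kwargs) -> None:
        super().__init__(**kwargs)
        self._prompts = prompts
        self.load_data()

    def load_data(self) -> None:
        # Nothing to download or parse; the prompts already live in memory.
        self.data = self._prompts

    def sample(self, tokenizer: PreTrainedTokenizerBase,
               num_requests: int) -> list[SampleRequest]:
        requests = [
            SampleRequest(
                prompt=prompt,
                prompt_len=len(tokenizer(prompt).input_ids),
                expected_output_len=32,
            ) for prompt in self._prompts[:num_requests]
        ]
        self.maybe_oversample_requests(requests, num_requests)
        return requests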
# -----------------------------------------------------------------------------
# Utility Functions and Global Caches
# -----------------------------------------------------------------------------


def is_valid_sequence(
    prompt_len: int,
    output_len: int,
    min_len: int = 4,
    max_prompt_len: int = 1024,
    max_total_len: int = 2048,
    skip_min_output_len_check: bool = False,
) -> bool:
    """
    Validate a sequence based on prompt and output lengths.

    Default pruning criteria are copied from the original `sample_hf_requests`
    and `sample_sharegpt_requests` functions in benchmark_serving.py, as well
    as from `sample_requests` in benchmark_throughput.py.
    """
    # Check for invalid conditions
    prompt_too_short = prompt_len < min_len
    output_too_short = (not skip_min_output_len_check) and (output_len
                                                            < min_len)
    prompt_too_long = prompt_len > max_prompt_len
    combined_too_long = (prompt_len + output_len) > max_total_len

    # Return True if none of the invalid conditions are met
    return not (prompt_too_short or output_too_short or prompt_too_long
                or combined_too_long)


@cache
def lora_path_on_disk(lora_path: str) -> str:
    return get_adapter_absolute_path(lora_path)


# Global cache for LoRA tokenizers.
lora_tokenizer_cache: dict[int, AnyTokenizer] = {}


def process_image(image: Any) -> Mapping[str, Any]:
    """
    Process a single image input and return a multimedia content dictionary.

    Supports three input types:

    1. Dictionary with raw image bytes:
       - Expects a dict with a 'bytes' key containing raw image data.
       - Loads the bytes as a PIL.Image.Image.

    2. PIL.Image.Image input:
       - Converts the image to RGB.
       - Saves the image as a JPEG in memory.
       - Encodes the JPEG data as a base64 string.
       - Returns a dictionary with the image as a base64 data URL.

    3. String input:
       - Treats the string as a URL or local file path.
       - Prepends "file://" if the string doesn't start with "http://" or
         "file://".
       - Returns a dictionary with the image URL.

    Raises:
        ValueError: If the input is not a supported type.
    """
    if isinstance(image, dict) and 'bytes' in image:
        image = Image.open(BytesIO(image['bytes']))
    if isinstance(image, Image.Image):
        image = image.convert("RGB")
        with io.BytesIO() as image_data:
            image.save(image_data, format="JPEG")
            image_base64 = base64.b64encode(
                image_data.getvalue()).decode("utf-8")
        return {
            "type": "image_url",
            "image_url": {
                "url": f"data:image/jpeg;base64,{image_base64}"
            },
        }

    if isinstance(image, str):
        image_url = (image if image.startswith(
            ("http://", "file://")) else f"file://{image}")
        return {"type": "image_url", "image_url": {"url": image_url}}

    raise ValueError(f"Invalid image input {image}. Must be a PIL.Image.Image"
                     " or str or dictionary with raw image bytes.")
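# Illustrative only, not part of the diff: quick sanity checks for the helpers
# above. The lengths, the image, and the "/tmp/cat.jpg" path are made up.
def _demo_sequence_and_image_helpers():
    from PIL import Image as PILImage

    # A 12-token prompt with a 100-token output passes the default limits...
    assert is_valid_sequence(prompt_len=12, output_len=100)
    # ...while a prompt longer than max_prompt_len (1024 by default) does not.
    assert not is_valid_sequence(prompt_len=2048, output_len=100)

    # A PIL image becomes a base64-encoded JPEG data URL.
    image = PILImage.new("RGB", (16, 16), color="white")
    assert process_image(image)["type"] == "image_url"

    # A plain local path is wrapped into a file:// URL.
    url = process_image("/tmp/cat.jpg")["image_url"]["url"]
    assert url == "file:///tmp/cat.jpg"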
# -----------------------------------------------------------------------------
# Random Dataset Implementation (Synthetic Data)
# -----------------------------------------------------------------------------


class RandomDataset(BenchmarkDataset):
    # Default values copied from benchmark_serving.py for the random dataset.
    DEFAULT_PREFIX_LEN = 0
    DEFAULT_RANGE_RATIO = 1.0
    DEFAULT_INPUT_LEN = 1024
    DEFAULT_OUTPUT_LEN = 128

    def __init__(
        self,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)

    def sample(
        self,
        tokenizer: PreTrainedTokenizerBase,
        num_requests: int,
        prefix_len: int = DEFAULT_PREFIX_LEN,
        range_ratio: float = DEFAULT_RANGE_RATIO,
        input_len: int = DEFAULT_INPUT_LEN,
        output_len: int = DEFAULT_OUTPUT_LEN,
        **kwargs,
    ) -> list[SampleRequest]:
        vocab_size = tokenizer.vocab_size

        prefix_token_ids = (np.random.randint(
            0, vocab_size, size=prefix_len).tolist() if prefix_len > 0 else [])

        input_low = int(input_len * range_ratio)
        output_low = int(output_len * range_ratio)

        input_lens = np.random.randint(input_low,
                                       input_len + 1,
                                       size=num_requests)
        output_lens = np.random.randint(output_low,
                                        output_len + 1,
                                        size=num_requests)
        offsets = np.random.randint(0, vocab_size, size=num_requests)

        requests = []
        for i in range(num_requests):
            inner_seq = ((offsets[i] + i + np.arange(input_lens[i])) %
                         vocab_size).tolist()
            token_sequence = prefix_token_ids + inner_seq
            prompt = tokenizer.decode(token_sequence)
            total_input_len = prefix_len + int(input_lens[i])
            requests.append(
                SampleRequest(
                    prompt=prompt,
                    prompt_len=total_input_len,
                    expected_output_len=int(output_lens[i]),
                ))
        return requests
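# Illustrative only, not part of the diff: generating a few synthetic requests
# with RandomDataset. The "gpt2" tokenizer is an arbitrary choice for the
# example; any HuggingFace tokenizer should work.
def _demo_random_dataset():
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    dataset = RandomDataset(random_seed=0)
    requests = dataset.sample(tokenizer=tokenizer,
                              num_requests=4,
                              input_len=256,
                              output_len=32)
    for request in requests:
        print(request.prompt_len, request.expected_output_len)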
# -----------------------------------------------------------------------------
# ShareGPT Dataset Implementation
# -----------------------------------------------------------------------------


class ShareGPTDataset(BenchmarkDataset):
    """
    Implements the ShareGPT dataset. Loads data from a JSON file and generates
    sample requests based on conversation turns.
    """

    def __init__(self, **kwargs) -> None:
        super().__init__(**kwargs)
        self.load_data()

    def load_data(self) -> None:
        if self.dataset_path is None:
            raise ValueError("dataset_path must be provided for loading data.")

        with open(self.dataset_path, encoding="utf-8") as f:
            self.data = json.load(f)
        # Filter entries with at least two conversation turns.
        self.data = [
            entry for entry in self.data
            if "conversations" in entry and len(entry["conversations"]) >= 2
        ]
        random.seed(self.random_seed)
        random.shuffle(self.data)

    def sample(
        self,
        tokenizer: PreTrainedTokenizerBase,
        num_requests: int,
        lora_path: Optional[str] = None,
        max_loras: Optional[int] = None,
        output_len: Optional[int] = None,
        enable_multimodal_chat: bool = False,
        **kwargs,
    ) -> list:
        samples: list = []
        for entry in self.data:
            if len(samples) >= num_requests:
                break
            prompt, completion = (
                entry["conversations"][0]["value"],
                entry["conversations"][1]["value"],
            )

            lora_request, tokenizer = self.get_random_lora_request(
                tokenizer=tokenizer, max_loras=max_loras, lora_path=lora_path)
            prompt_ids = tokenizer(prompt).input_ids
            completion_ids = tokenizer(completion).input_ids
            prompt_len = len(prompt_ids)
            new_output_len = (len(completion_ids)
                              if output_len is None else output_len)
            if not is_valid_sequence(prompt_len,
                                     new_output_len,
                                     skip_min_output_len_check=output_len
                                     is not None):
                continue
            if enable_multimodal_chat:
                prompt = self.apply_multimodal_chat_transformation(
                    prompt, None)
            samples.append(
                SampleRequest(
                    prompt=prompt,
                    prompt_len=prompt_len,
                    expected_output_len=new_output_len,
                    lora_request=lora_request,
                ))
        self.maybe_oversample_requests(samples, num_requests)
        return samples
# -----------------------------------------------------------------------------
# Sonnet Dataset Implementation
# -----------------------------------------------------------------------------


class SonnetDataset(BenchmarkDataset):
    """
    Simplified implementation of the Sonnet dataset. Loads poem lines from a
    text file and generates sample requests. Default values here copied from
    `benchmark_serving.py` for the sonnet dataset.
    """

    DEFAULT_PREFIX_LEN = 200
    DEFAULT_INPUT_LEN = 550
    DEFAULT_OUTPUT_LEN = 150

    def __init__(
        self,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        self.load_data()

    def load_data(self) -> None:
        if not self.dataset_path:
            raise ValueError("dataset_path must be provided.")
        with open(self.dataset_path, encoding="utf-8") as f:
            self.data = f.readlines()

    def sample(
        self,
        tokenizer,
        num_requests: int,
        prefix_len: int = DEFAULT_PREFIX_LEN,
        input_len: int = DEFAULT_INPUT_LEN,
        output_len: int = DEFAULT_OUTPUT_LEN,
        return_prompt_formatted: bool = False,
        **kwargs,
    ) -> list:
        # Calculate average token length for a poem line.
        tokenized_lines = [tokenizer(line).input_ids for line in self.data]
        avg_len = sum(len(tokens)
                      for tokens in tokenized_lines) / len(tokenized_lines)

        # Build the base prompt.
        base_prompt = "Pick as many lines as you can from these poem lines:\n"
        base_msg = [{"role": "user", "content": base_prompt}]
        base_fmt = tokenizer.apply_chat_template(base_msg,
                                                 add_generation_prompt=True,
                                                 tokenize=False)
        base_offset = len(tokenizer(base_fmt).input_ids)
        if input_len <= base_offset:
            raise ValueError(
                f"'input_len' must be higher than the base prompt length "
                f"({base_offset}).")

        # Determine how many poem lines to use.
        num_input_lines = round((input_len - base_offset) / avg_len)
        num_prefix_lines = round((prefix_len - base_offset) / avg_len)
        prefix_lines = self.data[:num_prefix_lines]

        samples = []
        for _ in range(num_requests):
            extra_lines = random.choices(self.data,
                                         k=num_input_lines - num_prefix_lines)
            prompt = f"{base_prompt}{''.join(prefix_lines + extra_lines)}"
            msg = [{"role": "user", "content": prompt}]
            prompt_formatted = tokenizer.apply_chat_template(
                msg, add_generation_prompt=True, tokenize=False)
            prompt_len = len(tokenizer(prompt_formatted).input_ids)
            samples.append(
                SampleRequest(
                    prompt=prompt_formatted
                    if return_prompt_formatted else prompt,
                    prompt_len=prompt_len,
                    expected_output_len=output_len,
                ))
        return samples
# -----------------------------------------------------------------------------
# BurstGPT Dataset Implementation
# -----------------------------------------------------------------------------


class BurstGPTDataset(BenchmarkDataset):
    """
    Implements the BurstGPT dataset. Loads data from a CSV file and generates
    sample requests based on synthetic prompt generation. Only rows with Model
    "GPT-4" and positive response tokens are used.
    """

    def __init__(self, **kwargs) -> None:
        super().__init__(**kwargs)
        self.load_data()

    def load_data(self):
        if self.dataset_path is None:
            raise ValueError("dataset_path must be provided for loading data.")

        df = pd.read_csv(self.dataset_path)
        # Filter to keep only GPT-4 rows.
        gpt4_df = df[df["Model"] == "GPT-4"]
        # Remove failed requests (where Response tokens is 0 or less).
        gpt4_df = gpt4_df[gpt4_df["Response tokens"] > 0]
        # Sample the desired number of rows.
        self.data = gpt4_df

    def _sample_loaded_data(self, num_requests: int) -> list:
        if num_requests <= len(self.data):
            data = self.data.sample(n=num_requests,
                                    random_state=self.random_seed)
        else:
            data = self.data.sample(
                n=num_requests,
                random_state=self.random_seed,
                replace=True,
            )
        # Convert the dataframe to a list of lists.
        return data.values.tolist()

    def sample(
        self,
        tokenizer: PreTrainedTokenizerBase,
        num_requests: int,
        max_loras: Optional[int] = None,
        lora_path: Optional[str] = None,
        **kwargs,
    ) -> list[SampleRequest]:
        samples = []
        data = self._sample_loaded_data(num_requests=num_requests)
        for i in range(num_requests):
            input_len = int(data[i][2])
            output_len = int(data[i][3])
            lora_req, tokenizer = self.get_random_lora_request(
                tokenizer=tokenizer, max_loras=max_loras, lora_path=lora_path)
            vocab_size = tokenizer.vocab_size
            # Generate a synthetic prompt: a list of token IDs computed as (i +
            # j) modulo vocab_size.
            token_ids = [(i + j) % vocab_size for j in range(input_len)]
            prompt = tokenizer.decode(token_ids)
            samples.append(
                SampleRequest(
                    prompt=prompt,
                    prompt_len=input_len,
                    expected_output_len=output_len,
                    lora_request=lora_req,
                ))
        return samples
# -----------------------------------------------------------------------------
# HuggingFace Dataset Base Implementation
# -----------------------------------------------------------------------------
class HuggingFaceDataset(BenchmarkDataset):
    """Base class for datasets hosted on HuggingFace."""

    SUPPORTED_DATASET_PATHS: Union[set[str], dict[str, Callable]] = set()

    def __init__(
        self,
        dataset_path: str,
        dataset_split: str,
        dataset_subset: Optional[str] = None,
        **kwargs,
    ) -> None:
        super().__init__(dataset_path=dataset_path, **kwargs)

        # Validate dataset path
        if self.SUPPORTED_DATASET_PATHS and \
            self.dataset_path not in self.SUPPORTED_DATASET_PATHS:
            raise ValueError(
                f"{self.__class__.__name__} "
                f"only supports: {', '.join(self.SUPPORTED_DATASET_PATHS)}. "
                "Please consider contributing if you would "
                "like to add support for additional dataset formats.")

        self.dataset_split = dataset_split
        self.dataset_subset = dataset_subset
        self.load_data()

    def load_data(self) -> None:
        """Load data from HuggingFace datasets."""
        self.data = load_dataset(
            self.dataset_path,
            name=self.dataset_subset,
            split=self.dataset_split,
            streaming=True,
        )
        self.data = self.data.shuffle(seed=self.random_seed)
# -----------------------------------------------------------------------------
# Conversation Dataset Implementation
# -----------------------------------------------------------------------------


class ConversationDataset(HuggingFaceDataset):
    """Dataset for conversation data with multimodal support."""
    SUPPORTED_DATASET_PATHS = {
        'lmms-lab/LLaVA-OneVision-Data', 'Aeala/ShareGPT_Vicuna_unfiltered'
    }

    def sample(self,
               tokenizer: PreTrainedTokenizerBase,
               num_requests: int,
               output_len: Optional[int] = None,
               enable_multimodal_chat: bool = False,
               **kwargs) -> list:
        # Filter examples with at least 2 conversations
        filtered_data = self.data.filter(
            lambda x: len(x["conversations"]) >= 2)
        sampled_requests = []
        dynamic_output = output_len is None

        for item in filtered_data:
            if len(sampled_requests) >= num_requests:
                break
            conv = item["conversations"]
            prompt, completion = conv[0]["value"], conv[1]["value"]

            prompt_ids = tokenizer(prompt).input_ids
            completion_ids = tokenizer(completion).input_ids
            prompt_len = len(prompt_ids)
            completion_len = len(completion_ids)
            output_len = completion_len if dynamic_output else output_len
            assert isinstance(output_len, int) and output_len > 0
            if dynamic_output and not is_valid_sequence(
                    prompt_len, completion_len):
                continue
            mm_content = process_image(
                item["image"]) if "image" in item else None
            if enable_multimodal_chat:
                # Note: when chat is enabled the request prompt_len is no
                # longer accurate and we will be using request output to count
                # the actual prompt len and output len
                prompt = self.apply_multimodal_chat_transformation(
                    prompt, mm_content)
            sampled_requests.append(
                SampleRequest(
                    prompt=prompt,
                    prompt_len=prompt_len,
                    expected_output_len=output_len,
                    multi_modal_data=mm_content,
                ))
        self.maybe_oversample_requests(sampled_requests, num_requests)
        return sampled_requests
# -----------------------------------------------------------------------------
# Vision Arena Dataset Implementation
# -----------------------------------------------------------------------------


class VisionArenaDataset(HuggingFaceDataset):
    """
    Vision Arena Dataset.
    """

    DEFAULT_OUTPUT_LEN = 128
    SUPPORTED_DATASET_PATHS = {
        "lmarena-ai/VisionArena-Chat":
        lambda x: x["conversation"][0][0]["content"],
        "lmarena-ai/vision-arena-bench-v0.1":
        lambda x: x["turns"][0][0]["content"]
    }

    def sample(
        self,
        tokenizer: PreTrainedTokenizerBase,
        num_requests: int,
        output_len: Optional[int] = None,
        enable_multimodal_chat: bool = False,
        **kwargs,
    ) -> list:
        output_len = (output_len
                      if output_len is not None else self.DEFAULT_OUTPUT_LEN)
        sampled_requests = []
        for item in self.data:
            if len(sampled_requests) >= num_requests:
                break
            parser_fn = self.SUPPORTED_DATASET_PATHS.get(self.dataset_path)
            if parser_fn is None:
                raise ValueError(
                    f"Unsupported dataset path: {self.dataset_path}")
            prompt = parser_fn(item)
            mm_content = process_image(item["images"][0])
            prompt_len = len(tokenizer(prompt).input_ids)
            if enable_multimodal_chat:
                # Note: when chat is enabled the request prompt_len is no
                # longer accurate and we will be using request output to count
                # the actual prompt len
                prompt = self.apply_multimodal_chat_transformation(
                    prompt, mm_content)
            sampled_requests.append(
                SampleRequest(
                    prompt=prompt,
                    prompt_len=prompt_len,
                    expected_output_len=output_len,
                    multi_modal_data=mm_content,
                ))
        self.maybe_oversample_requests(sampled_requests, num_requests)
        return sampled_requests
# -----------------------------------------------------------------------------
# Instruct Coder Dataset Implementation
# -----------------------------------------------------------------------------


class InstructCoderDataset(HuggingFaceDataset):
    """
    InstructCoder Dataset.
    https://huggingface.co/datasets/likaixin/InstructCoder

    InstructCoder is a dataset designed for general code editing. It consists
    of 114,239 instruction-input-output triplets and covers multiple distinct
    code editing scenarios.
    """

    DEFAULT_OUTPUT_LEN = 200  # this is the average default output length
    SUPPORTED_DATASET_PATHS = {
        "likaixin/InstructCoder",
    }

    def sample(self,
               tokenizer: PreTrainedTokenizerBase,
               num_requests: int,
               output_len: Optional[int] = None,
               enable_multimodal_chat: bool = False,
               **kwargs) -> list:
        output_len = (output_len
                      if output_len is not None else self.DEFAULT_OUTPUT_LEN)
        sampled_requests = []
        for item in self.data:
            if len(sampled_requests) >= num_requests:
                break
            prompt = f"{item['instruction']}:\n{item['input']}"
            prompt_len = len(tokenizer(prompt).input_ids)
            sampled_requests.append(
                SampleRequest(
                    prompt=prompt,
                    prompt_len=prompt_len,
                    expected_output_len=output_len,
                ))
        self.maybe_oversample_requests(sampled_requests, num_requests)
        return sampled_requests
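# Illustrative only, not part of the diff: a sketch of sampling from one of the
# HuggingFace-backed datasets defined above. It assumes network access to the
# HuggingFace Hub and a locally available "gpt2" tokenizer.
def _demo_instruct_coder_dataset():
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("gpt2")
    dataset = InstructCoderDataset(dataset_path="likaixin/InstructCoder",
                                   dataset_split="train")
    requests = dataset.sample(tokenizer=tokenizer, num_requests=4)
    for request in requests:
        print(request.prompt_len, request.expected_output_len)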
@@ -1,494 +0,0 @@
"""Benchmark guided decoding throughput."""
import argparse
import dataclasses
import json
import os
import random
import time
from typing import List

import datasets
import pandas as pd
import uvloop
from transformers import AutoTokenizer, PreTrainedTokenizerBase

from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
from vllm.entrypoints.openai.api_server import (
    build_async_engine_client_from_engine_args)
from vllm.sampling_params import GuidedDecodingParams
from vllm.utils import FlexibleArgumentParser, merge_async_iterators


@dataclasses.dataclass
class SampleRequest:
    """A class representing a single inference request for benchmarking.

    Attributes:
        prompt: The input text prompt for the model.
        multi_modal_data: Optional dictionary containing multi-modal data (e.g.
            images).
        prompt_len: The length of the prompt in tokens.
        expected_output_len: The expected length of the output in tokens.
    """
    prompt: str
    prompt_len: int
    expected_output_len: int
    schema: dict
    structure_type: str = 'json'
    completion: str = None


def run_vllm(requests: List[SampleRequest],
             engine_args: EngineArgs,
             n: int,
             guided_decoding_rate: float = 1.0,
             warmup: bool = False) -> float:
    from vllm import LLM, SamplingParams
    llm = LLM(**vars(engine_args))

    # Add the requests to the engine.
    prompts: List[str] = []
    sampling_params: List[SamplingParams] = []
    # create a list containing random selected true or false
    guided_decoding_req_idx = random.sample(
        range(len(requests)), int(len(requests) * guided_decoding_rate))

    if warmup:
        print(">>>>> Running warmup prompt, for the first 5")
        # We setup the first 5 requests to warmup FSM
        # if using xgrammar dataset, we will skip warmup
        warmup_requests = requests[:5]
        for i, request in enumerate(warmup_requests):
            prompts.append(request.prompt)
            sampling_params.append(
                SamplingParams(
                    n=n,
                    temperature=1.0,
                    top_p=1.0,
                    ignore_eos=True,
                    max_tokens=request.expected_output_len,
                    guided_decoding=GuidedDecodingParams(json=request.schema)
                    if guided_decoding_rate > 0 else None,
                ))
        llm.generate(prompts, sampling_params, use_tqdm=False)

    print(">>>>> Benchmark started...")
    prompts = []
    sampling_params = []
    for i, request in enumerate(requests):
        prompts.append(request.prompt)
        sampling_params.append(
            SamplingParams(
                n=n,
                temperature=1.0,
                top_p=1.0,
                ignore_eos=True,
                max_tokens=request.expected_output_len,
                guided_decoding=GuidedDecodingParams(
                    **{request.structure_type: request.schema})
                if i in guided_decoding_req_idx else None,
            ))

    start = time.perf_counter()
    outputs = llm.generate(prompts, sampling_params, use_tqdm=False)
    ret = []
    for output, request in zip(outputs, requests):
        generated_text = output.outputs[0].text
        ret.append({
            "generated": generated_text,
            "expected": request.completion
        })
    end = time.perf_counter()
    return end - start, ret


async def run_vllm_async(
        requests: List[SampleRequest],
        engine_args: AsyncEngineArgs,
        n: int,
        guided_decoding_rate: float = 1.0,
        warmup: bool = False,
        disable_frontend_multiprocessing: bool = False) -> float:
    from vllm import SamplingParams

    async with build_async_engine_client_from_engine_args(
            engine_args, disable_frontend_multiprocessing) as llm:

        # Add the requests to the engine.
        prompts: List[str] = []
        sampling_params: List[SamplingParams] = []
        guided_decoding_req_idx = random.sample(
            range(len(requests)), int(len(requests) * guided_decoding_rate))

        if warmup:
            print(">>>>>> Running warmup prompt, for the first 5")
            # We setup the first 5 requests to warmup FSM
            # if using xgrammar dataset, we will skip warmup
            warmup_requests = requests[:5]
            for i, request in enumerate(warmup_requests):
                prompts.append(request.prompt)
                sampling_params.append(
                    SamplingParams(
                        n=n,
                        temperature=1.0,
                        top_p=1.0,
                        ignore_eos=True,
                        max_tokens=request.expected_output_len,
                        guided_decoding=GuidedDecodingParams(
                            json=request.schema)
                        if guided_decoding_rate > 0 else None,
                    ))
            generators = []
            for i, (prompt, sp) in enumerate(zip(prompts, sampling_params)):
                generator = llm.generate(prompt, sp, request_id=f"test{i}")
                generators.append(generator)
            all_gens = merge_async_iterators(*generators)
            async for i, res in all_gens:
                pass

        print(">>>>> Benchmark started...")
        prompts = []
        sampling_params = []
        for i, request in enumerate(requests):
            prompts.append(request.prompt)
            sampling_params.append(
                SamplingParams(
                    n=n,
                    temperature=1.0,
                    top_p=1.0,
                    ignore_eos=True,
                    max_tokens=request.expected_output_len,
                    guided_decoding=GuidedDecodingParams(json=request.schema)
                    if i in guided_decoding_req_idx else None,
                ))

        generators = []
        start_time = []
        latencies = []
        start = time.perf_counter()
        for i, (prompt, sp) in enumerate(zip(prompts, sampling_params)):
            generator = llm.generate(prompt, sp, request_id=f"test{i}")
            generators.append(generator)
            start_time.append(time.perf_counter())
            latencies.append([])
        all_gens = merge_async_iterators(*generators)
        generated_texts = [''] * len(requests)
        async for i, res in all_gens:
            generated_texts[i] = res.outputs[0].text
            lat = time.perf_counter() - start_time[i]
            latencies[i].append(lat)
        ret = [{
            'generated': gt,
            'expected': req.completion
        } for gt, req in zip(generated_texts, requests)]
        end = time.perf_counter()
        first_latency = pd.Series([lat[0] * 1000 for lat in latencies])
        next_latency = pd.Series([(lat[-1] - lat[0]) / len(lat[1:]) * 1000
                                  for lat in latencies])
        return end - start, ret, (first_latency, next_latency)


def sample_requests(tokenizer: PreTrainedTokenizerBase,
                    args: argparse.Namespace) -> List[SampleRequest]:
    if args.dataset == 'json':
        if args.json_schema_path is None:
            dir_path = os.path.dirname(os.path.realpath(__file__))
            args.json_schema_path = os.path.join(dir_path,
                                                 "structured_schemas",
                                                 "structured_schema_1.json")
        with open(args.json_schema_path) as f:
            schema = json.load(f)
        prompt = f"Generate an example of a user profile given the following schema: {json.dumps(schema)}"  # noqa: E501
        input_len = len(tokenizer(prompt).input_ids)
        print(f"Input length of the prompt: {input_len} tokens")
        requests = [
            SampleRequest(prompt=prompt,
                          prompt_len=input_len,
                          expected_output_len=args.output_len,
                          schema=schema,
                          structure_type=args.structure_type)
            for _ in range(args.num_prompts)
        ]

    elif args.dataset == "grammar":
        schema = """
            ?start: select_statement

            ?select_statement: "SELECT " column_list " FROM " table_name

            ?column_list: column_name ("," column_name)*

            ?table_name: identifier

            ?column_name: identifier

            ?identifier: /[a-zA-Z_][a-zA-Z0-9_]*/
        """
        prompt = "Generate an SQL query to show the 'username' \
            and 'email' from the 'users' table."

        input_len = len(tokenizer(prompt).input_ids)
        print(f"Input length of the prompt: {input_len} tokens")
        requests = [
            SampleRequest(prompt=prompt,
                          prompt_len=input_len,
                          expected_output_len=args.output_len,
                          schema=schema,
                          structure_type=args.structure_type)
            for _ in range(args.num_prompts)
        ]

    elif args.dataset == "regex":
        regex = r"\w+@\w+\.com\n"
        args.regex = regex
        prompt = "Generate an email address for Alan Turing, \
            who works in Enigma. End in .com and new line. \
            Example result: alan.turing@enigma.com\n"

        input_len = len(tokenizer(prompt).input_ids)
        print(f"Input length of the prompt: {input_len} tokens")
        requests = [
            SampleRequest(prompt=prompt,
                          prompt_len=input_len,
                          expected_output_len=args.output_len,
                          schema=regex,
                          structure_type=args.structure_type)
            for _ in range(args.num_prompts)
        ]

    elif args.dataset == "choice":
        choice = ["Positive", "Negative"]
        args.choice = choice
        prompt = "Classify this sentiment: vLLM is wonderful!"
        input_len = len(tokenizer(prompt).input_ids)
        print(f"Input length of the prompt: {input_len} tokens")
        requests = [
            SampleRequest(prompt=prompt,
                          prompt_len=input_len,
                          expected_output_len=args.output_len,
                          schema=choice,
                          structure_type=args.structure_type)
            for _ in range(args.num_prompts)
        ]

    elif args.dataset == "xgrammar_bench":
        args.warmup = False
        requests: List[SampleRequest] = []
        dataset = datasets.load_dataset("NousResearch/json-mode-eval",
                                        split="train")
        print(f"dataset has {len(dataset)} entries")
        len_dataset = len(dataset)
        for data_point_idx in range(args.num_prompts):
            idx = data_point_idx
            while idx >= len_dataset:
                idx -= len_dataset
            schema = dataset["schema"][idx]
            prompt = tokenizer.apply_chat_template(dataset["prompt"][idx],
                                                   tokenize=False)
            input_len = len(tokenizer(prompt).input_ids)
            completion = dataset["completion"][idx]

            requests.append(
                SampleRequest(prompt=prompt,
                              prompt_len=input_len,
                              expected_output_len=args.output_len,
                              schema=schema,
                              completion=completion))

    return requests


def evaluate(ret, args):

    def _eval_correctness_json(expected, actual):
        # extract json string from string using regex
        import re
        actual = actual.replace('\n', '').replace(' ', '').strip()
        try:
            actual = re.search(r'\{.*\}', actual).group()
            actual = json.loads(actual)
        except Exception:
            return False

        return True

    def _eval_correctness_choice(expected, actual):
        return actual in args.choice

    def _eval_correctness_regex(expected, actual):
        import re
        return re.match(args.regex, actual) is not None

    def _eval_correctness(expected, actual):
        if args.structure_type == 'json':
            return _eval_correctness_json(expected, actual)
        elif args.structure_type == 'regex':
            return _eval_correctness_regex(expected, actual)
        elif args.structure_type == 'choice':
            return _eval_correctness_choice(expected, actual)
        else:
            return None

    scores = []
    for res in ret:
        score = _eval_correctness(res['expected'], res['generated'])
        res['correctness'] = score
        scores.append(score)

    not_none_scores = [score for score in scores if score is not None]

    return (sum(not_none_scores) / len(not_none_scores) *
            100) if len(not_none_scores) > 0 else None


def main(args: argparse.Namespace):
    print(args)
    random.seed(args.seed)

    # async engine is working for 'regex', 'choice' and 'grammar'
    if args.dataset == 'grammar':
        args.structure_type = 'grammar'
        args.async_engine = False
    elif args.dataset == 'regex':
        args.structure_type = 'regex'
        args.async_engine = False
    elif args.dataset == 'choice':
        args.structure_type = 'choice'
        args.async_engine = False
    else:
        args.structure_type = 'json'

    if args.no_guided_decoding:
        args.guided_decoding_ratio = 0
    if args.save_results:
        result_file_name = f'{args.guided_decoding_ratio}guided'
        result_file_name += f"_{args.model.split('/')[-1]}"
        result_file_name += f"_{args.dataset}"
        result_file_name += f"_{args.num_prompts}"
        result_file_name += f"_out{args.output_len}"
        result_file_name += f"_async{args.async_engine}"
        result_file_name += f"_warmup{args.warmup}"
        result_file_name += f"_chunkedprefill{args.enable_chunked_prefill}"
        result_file_name += ".txt"
    else:
        result_file_name = None

    # Synthesize a prompt with the given input length.
    tokenizer = AutoTokenizer.from_pretrained(
        args.tokenizer, trust_remote_code=args.trust_remote_code)
    requests = sample_requests(tokenizer, args)

    if args.async_engine:
        engine_args = AsyncEngineArgs.from_cli_args(args)
        elapsed_time, ret, (first_latency, next_latency) = uvloop.run(
            run_vllm_async(requests, engine_args, args.n,
                           args.guided_decoding_ratio, args.warmup,
                           args.disable_frontend_multiprocessing))
    else:
        engine_args = EngineArgs.from_cli_args(args)
        elapsed_time, ret = run_vllm(requests, engine_args, args.n,
                                     args.guided_decoding_ratio, args.warmup)
        first_latency, next_latency = None, None

    score = evaluate(ret, args)
    total_num_tokens = sum(request.prompt_len + request.expected_output_len
                           for request in requests)
    total_output_tokens = sum(request.expected_output_len
                              for request in requests)
    if first_latency is not None:
        latency_breakdown = "\nFirst token latency(msecs):\n"
        latency_breakdown += f"{first_latency.describe()}"
        latency_breakdown += "\nNext token latency(msecs):\n"
        latency_breakdown += f"{next_latency.describe()}"
    print(
        f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
        f"{total_num_tokens / elapsed_time:.2f} total tokens/s, "
        f"{total_output_tokens / elapsed_time:.2f} output tokens/s",
        f"Correct rate is {score} %",
        f"{latency_breakdown if first_latency is not None else ''}")

    # Output JSON results if specified
    if args.output_json or result_file_name:
        results = {
            "elapsed_time": elapsed_time,
            "num_requests": len(requests),
            "total_num_tokens": total_num_tokens,
            "total_output_tokens": total_output_tokens,
            "requests_per_second": len(requests) / elapsed_time,
            "tokens_per_second": f"{total_num_tokens / elapsed_time:.2f}",
            "output_tokens_per_second":
            f"{total_output_tokens / elapsed_time:.2f}",
            "correct_rate(%)": score
        }
        results = {"outputs": ret, **results}
        if first_latency is not None:
            results["first_token_latency(msecs)"] = first_latency.describe(
            ).to_dict()
            results["next_token_latency(msecs)"] = next_latency.describe(
            ).to_dict()
        if args.output_json:
            with open(args.output_json, "w") as f:
                json.dump(results, f, indent=4)
        elif result_file_name:
            with open(result_file_name, "w") as f:
                json.dump(results, f, indent=4)


if __name__ == "__main__":
    parser = FlexibleArgumentParser(description="Benchmark guided decoding.")
    parser = AsyncEngineArgs.add_cli_args(parser)

    parser.add_argument("--output-len",
                        type=int,
                        default=512,
                        help="Output length for each request. Overrides the "
                        "output length from the dataset.")
    parser.add_argument(
        "--dataset",
        default='json',
        choices=['json', 'grammar', 'regex', 'choice', 'xgrammar_bench'])
    parser.add_argument("--json_schema_path",
                        type=str,
                        default=None,
                        help="Path to json schema.")
    parser.add_argument("--n",
                        type=int,
                        default=1,
                        help="Number of generated sequences per prompt.")
    parser.add_argument("--num-prompts",
                        type=int,
                        default=10,
                        help="Number of prompts to process.")
    parser.add_argument(
        '--output-json',
        type=str,
        default=None,
        help='Path to save the throughput results in JSON format.')
    parser.add_argument("--async-engine",
                        action='store_true',
                        default=False,
                        help="Use vLLM async engine rather than LLM class.")
    parser.add_argument("--no-guided-decoding",
                        action='store_true',
                        default=False,
                        help="Whether to disable JSON decoding or not.")
    parser.add_argument("--guided-decoding-ratio",
                        type=float,
                        default=1.0,
                        help="Ratio of Guided Decoding requests")
    parser.add_argument("--disable-frontend-multiprocessing",
                        action='store_true',
                        default=False,
                        help="Disable decoupled async engine frontend.")
    parser.add_argument("--warmup",
                        action="store_true",
                        default=False,
                        help="Run warmup prompts before benchmark.")
    parser.add_argument("--save-results",
                        action="store_true",
                        default=False,
                        help="save output results.")
    args = parser.parse_args()
    if args.tokenizer is None:
        args.tokenizer = args.model
    main(args)
@@ -1,21 +1,38 @@
+# SPDX-License-Identifier: Apache-2.0
 """Benchmark the latency of processing a single batch of requests."""
 
 import argparse
 import dataclasses
 import json
+import os
 import time
 from pathlib import Path
-from typing import List, Optional
+from typing import Any, Optional
 
 import numpy as np
 import torch
+from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
 from tqdm import tqdm
 
 from vllm import LLM, SamplingParams
 from vllm.engine.arg_utils import EngineArgs
 from vllm.inputs import PromptType
+from vllm.sampling_params import BeamSearchParams
 from vllm.utils import FlexibleArgumentParser
 
 
+def save_to_pytorch_benchmark_format(args: argparse.Namespace,
+                                     results: dict[str, Any]) -> None:
+    pt_records = convert_to_pytorch_benchmark_format(
+        args=args,
+        metrics={"latency": results["latencies"]},
+        extra_info={k: results[k]
+                    for k in ["avg_latency", "percentiles"]})
+    if pt_records:
+        pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch.json"
+        write_to_json(pt_file, pt_records)
+
+
 def main(args: argparse.Namespace):
     print(args)
 
@@ -24,6 +41,10 @@ def main(args: argparse.Namespace):
     # NOTE(woosuk): If the request cannot be processed in a single batch,
     # the engine will automatically process the request in multiple batches.
     llm = LLM(**dataclasses.asdict(engine_args))
+    assert llm.llm_engine.model_config.max_model_len >= (
+        args.input_len +
+        args.output_len), ("Please ensure that max_model_len is greater than"
+                           " the sum of input_len and output_len.")
 
     sampling_params = SamplingParams(
         n=args.n,
@@ -31,15 +52,31 @@ def main(args: argparse.Namespace):
         top_p=1.0,
         ignore_eos=True,
         max_tokens=args.output_len,
+        detokenize=not args.disable_detokenize,
     )
     print(sampling_params)
     dummy_prompt_token_ids = np.random.randint(10000,
                                                size=(args.batch_size,
                                                      args.input_len))
-    dummy_prompts: List[PromptType] = [{
+    dummy_prompts: list[PromptType] = [{
         "prompt_token_ids": batch
     } for batch in dummy_prompt_token_ids.tolist()]
 
+    def llm_generate():
+        if not args.use_beam_search:
+            llm.generate(dummy_prompts,
+                         sampling_params=sampling_params,
+                         use_tqdm=False)
+        else:
+            llm.beam_search(
+                dummy_prompts,
+                BeamSearchParams(
+                    beam_width=args.n,
+                    max_tokens=args.output_len,
+                    ignore_eos=True,
+                ),
+            )
+
     def run_to_completion(profile_dir: Optional[str] = None):
         if profile_dir:
             with torch.profiler.profile(
@@ -48,16 +85,13 @@ def main(args: argparse.Namespace):
                         torch.profiler.ProfilerActivity.CUDA,
                     ],
                     on_trace_ready=torch.profiler.tensorboard_trace_handler(
-                        str(profile_dir))) as p:
-                llm.generate(dummy_prompts,
-                             sampling_params=sampling_params,
-                             use_tqdm=False)
-            print(p.key_averages())
+                        str(profile_dir)),
+            ) as p:
+                llm_generate()
+            print(p.key_averages().table(sort_by="self_cuda_time_total"))
         else:
             start_time = time.perf_counter()
-            llm.generate(dummy_prompts,
-                         sampling_params=sampling_params,
-                         use_tqdm=False)
+            llm_generate()
            end_time = time.perf_counter()
            latency = end_time - start_time
            return latency
@@ -69,9 +103,8 @@ def main(args: argparse.Namespace):
     if args.profile:
         profile_dir = args.profile_result_dir
         if not profile_dir:
-            profile_dir = Path(
-                "."
-            ) / "vllm_benchmark_result" / f"latency_result_{time.time()}"
+            profile_dir = (Path(".") / "vllm_benchmark_result" /
+                           f"latency_result_{time.time()}")
         print(f"Profiling (results will be saved to '{profile_dir}')...")
         run_to_completion(profile_dir=profile_dir)
         return
@@ -83,9 +116,9 @@ def main(args: argparse.Namespace):
     latencies = np.array(latencies)
     percentages = [10, 25, 50, 75, 90, 99]
     percentiles = np.percentile(latencies, percentages)
-    print(f'Avg latency: {np.mean(latencies)} seconds')
+    print(f"Avg latency: {np.mean(latencies)} seconds")
     for percentage, percentile in zip(percentages, percentiles):
-        print(f'{percentage}% percentile latency: {percentile} seconds')
+        print(f"{percentage}% percentile latency: {percentile} seconds")
 
     # Output JSON results if specified
     if args.output_json:
@@ -96,43 +129,57 @@ def main(args: argparse.Namespace):
         }
         with open(args.output_json, "w") as f:
             json.dump(results, f, indent=4)
+        save_to_pytorch_benchmark_format(args, results)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     parser = FlexibleArgumentParser(
-        description='Benchmark the latency of processing a single batch of '
-        'requests till completion.')
-    parser.add_argument('--input-len', type=int, default=32)
-    parser.add_argument('--output-len', type=int, default=128)
-    parser.add_argument('--batch-size', type=int, default=8)
-    parser.add_argument('--n',
-                        type=int,
-                        default=1,
-                        help='Number of generated sequences per prompt.')
-    parser.add_argument('--use-beam-search', action='store_true')
-    parser.add_argument('--num-iters-warmup',
-                        type=int,
-                        default=10,
-                        help='Number of iterations to run for warmup.')
-    parser.add_argument('--num-iters',
+        description="Benchmark the latency of processing a single batch of "
+        "requests till completion.")
+    parser.add_argument("--input-len", type=int, default=32)
+    parser.add_argument("--output-len", type=int, default=128)
+    parser.add_argument("--batch-size", type=int, default=8)
+    parser.add_argument(
+        "--n",
+        type=int,
+        default=1,
+        help="Number of generated sequences per prompt.",
+    )
+    parser.add_argument("--use-beam-search", action="store_true")
+    parser.add_argument(
+        "--num-iters-warmup",
+        type=int,
+        default=10,
+        help="Number of iterations to run for warmup.",
+    )
+    parser.add_argument("--num-iters",
                         type=int,
                         default=30,
-                        help='Number of iterations to run.')
+                        help="Number of iterations to run.")
     parser.add_argument(
-        '--profile',
-        action='store_true',
-        help='profile the generation process of a single batch')
+        "--profile",
+        action="store_true",
+        help="profile the generation process of a single batch",
+    )
     parser.add_argument(
-        '--profile-result-dir',
+        "--profile-result-dir",
         type=str,
         default=None,
-        help=('path to save the pytorch profiler output. Can be visualized '
-              'with ui.perfetto.dev or Tensorboard.'))
+        help=("path to save the pytorch profiler output. Can be visualized "
+              "with ui.perfetto.dev or Tensorboard."),
+    )
     parser.add_argument(
-        '--output-json',
+        "--output-json",
         type=str,
         default=None,
-        help='Path to save the latency results in JSON format.')
+        help="Path to save the latency results in JSON format.",
+    )
+    parser.add_argument(
+        "--disable-detokenize",
+        action="store_true",
+        help=("Do not detokenize responses (i.e. do not include "
+              "detokenization time in the latency measurement)"),
+    )
 
     parser = EngineArgs.add_cli_args(parser)
     args = parser.parse_args()
benchmarks/benchmark_long_document_qa_throughput.py (new file, 184 lines)
@@ -0,0 +1,184 @@
# SPDX-License-Identifier: Apache-2.0
"""
Offline benchmark to test the long document QA throughput.

Example usage:
    # This workload samples 8 different prompts with a default input
    # length of 20000 tokens, then replicates each prompt 2 times
    # in random order.
    python benchmark_long_document_qa_throughput.py \
        --model meta-llama/Llama-2-7b-chat-hf \
        --enable-prefix-caching \
        --num-documents 8 \
        --repeat-count 2

Commandline arguments:
    --num-documents: The number of documents to sample prompts from.

    --document-length: The length of each document in tokens.
                       (Optional, default: 20000)

    --output-len: The number of tokens to generate for each prompt.
                  (Optional, default: 10)

    --repeat-count: The number of times to repeat each prompt.
                    (Optional, default: 2)

    --repeat-mode: The mode to repeat prompts. The supported modes are:
        - 'random': shuffle the prompts randomly. (Default)
        - 'tile': the entire prompt list is repeated in sequence. (Potentially
          lowest cache hit)
        - 'interleave': each prompt is repeated consecutively before
          moving to the next element. (Highest cache hit)

    --shuffle-seed: Random seed when the repeat mode is "random".
                    (Optional, default: 0)

In the meantime, it also supports all the vLLM engine args to initialize the
LLM engine. You can refer to the `vllm.engine.arg_utils.EngineArgs` for more
details.
"""

import dataclasses
import random
import time

from vllm import LLM, SamplingParams
from vllm.engine.arg_utils import EngineArgs
from vllm.utils import FlexibleArgumentParser


def test_long_document_qa(llm=None, sampling_params=None, prompts=None):
    """
    Test long document QA with the given prompts and sampling parameters.
    Print the time spent in processing all the prompts.

    Args:
        llm: The language model used for generating responses.
        sampling_params: Sampling parameter used to generate the response.
        prompts: A list of prompt strings to be processed by the LLM.
    """
    start_time = time.time()
    llm.generate(prompts, sampling_params=sampling_params)
    end_time = time.time()
    print(f"Time to execute all requests: {end_time - start_time:.4f} secs")


def repeat_prompts(prompts, repeat_count, mode: str):
    """
    Repeat each prompt in the list for a specified number of times.
    The order of prompts in the output list depends on the mode.

    Args:
        prompts: A list of prompts to be repeated.
        repeat_count: The number of times each prompt is repeated.
        mode: The mode of repetition. Supported modes are:
            - 'random': Shuffle the prompts randomly after repetition.
            - 'tile': Repeat the entire prompt list in sequence.
              Example: [1, 2, 3] -> [1, 2, 3, 1, 2, 3].
            - 'interleave': Repeat each prompt consecutively before moving to
              the next. Example: [1, 2, 3] -> [1, 1, 2, 2, 3, 3].

    Returns:
        A list of repeated prompts in the specified order.

    Raises:
        ValueError: If an invalid mode is provided.
    """
    print("Repeat mode: ", mode)
    if mode == 'random':
        repeated_prompts = prompts * repeat_count
        random.shuffle(repeated_prompts)
        return repeated_prompts
    elif mode == 'tile':
        return prompts * repeat_count
    elif mode == 'interleave':
        repeated_prompts = []
        for prompt in prompts:
            repeated_prompts.extend([prompt] * repeat_count)
        return repeated_prompts
    else:
        raise ValueError(f"Invalid mode: {mode}, only support "
                         "'random', 'tile', 'interleave'")


def main(args):
    random.seed(args.shuffle_seed)

    # Prepare the prompts:
    # we append the document id at the beginning to avoid any of the document
    # being the prefix of other documents
    prompts = [
        str(i) + ' '.join(['hi'] * args.document_length)
        for i in range(args.num_documents)
    ]

    prompts = repeat_prompts(prompts, args.repeat_count, mode=args.repeat_mode)

    warmup_prompts = [
        "This is warm up request " + str(i) + \
            ' '.join(['hi'] * args.document_length)
        for i in range(args.num_documents)]

    # Create the LLM engine
    engine_args = EngineArgs.from_cli_args(args)
    llm = LLM(**dataclasses.asdict(engine_args))
    sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len)
|
||||||
|
|
||||||
|
print("------warm up------")
|
||||||
|
test_long_document_qa(
|
||||||
|
llm=llm,
|
||||||
|
prompts=warmup_prompts,
|
||||||
|
sampling_params=sampling_params,
|
||||||
|
)
|
||||||
|
|
||||||
|
print("------start generating------")
|
||||||
|
test_long_document_qa(
|
||||||
|
llm=llm,
|
||||||
|
prompts=prompts,
|
||||||
|
sampling_params=sampling_params,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
parser = FlexibleArgumentParser(
|
||||||
|
description=
|
||||||
|
'Benchmark the performance with or without automatic prefix caching.')
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
'--document-length',
|
||||||
|
type=int,
|
||||||
|
# Roughly the number of tokens for a system paper,
|
||||||
|
# excluding images
|
||||||
|
default=20000,
|
||||||
|
help='Range of input lengths for sampling prompts,'
|
||||||
|
'specified as "min:max" (e.g., "128:256").')
|
||||||
|
|
||||||
|
parser.add_argument('--num-documents',
|
||||||
|
type=int,
|
||||||
|
default=8,
|
||||||
|
help='Range of input lengths for sampling prompts,'
|
||||||
|
'specified as "min:max" (e.g., "128:256").')
|
||||||
|
|
||||||
|
parser.add_argument('--output-len', type=int, default=10)
|
||||||
|
|
||||||
|
parser.add_argument('--repeat-count',
|
||||||
|
type=int,
|
||||||
|
default=2,
|
||||||
|
help='Number of times to repeat each prompt')
|
||||||
|
|
||||||
|
parser.add_argument("--repeat-mode",
|
||||||
|
type=str,
|
||||||
|
default='random',
|
||||||
|
help='The mode to repeat prompts. The supported '
|
||||||
|
'modes are "random", "tile", and "interleave". '
|
||||||
|
'See repeat_prompts() in the source code for details.')
|
||||||
|
|
||||||
|
parser.add_argument("--shuffle-seed",
|
||||||
|
type=int,
|
||||||
|
default=0,
|
||||||
|
help='Random seed when the repeat mode is "random"')
|
||||||
|
|
||||||
|
parser = EngineArgs.add_cli_args(parser)
|
||||||
|
args = parser.parse_args()
|
||||||
|
main(args)
|
||||||
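For illustration (not part of the diff): the three repeat modes above only differ in how the prompt list is ordered, which directly controls how often the prefix cache can be hit. A standalone sketch, no vLLM required, that mirrors that ordering behaviour:

# Standalone illustration of the 'random', 'tile' and 'interleave' repeat modes.
import random

def repeat_prompts(prompts, repeat_count, mode):
    if mode == "random":
        repeated = prompts * repeat_count
        random.shuffle(repeated)
        return repeated
    if mode == "tile":
        # [1, 2, 3] -> [1, 2, 3, 1, 2, 3]
        return prompts * repeat_count
    if mode == "interleave":
        # [1, 2, 3] -> [1, 1, 2, 2, 3, 3]
        return [p for p in prompts for _ in range(repeat_count)]
    raise ValueError(f"Invalid mode: {mode}")

random.seed(0)
docs = ["doc0", "doc1", "doc2"]
print(repeat_prompts(docs, 2, "tile"))        # lowest prefix-cache hit rate
print(repeat_prompts(docs, 2, "interleave"))  # highest prefix-cache hit rate
print(repeat_prompts(docs, 2, "random"))      # shuffled order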
Prefix-caching efficiency benchmark:

@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: Apache-2.0
 """
 Benchmark the efficiency of prefix caching.

@@ -10,7 +11,8 @@ Fixed example usage:
         --model meta-llama/Llama-2-7b-chat-hf \
         --enable-prefix-caching \
         --num-prompts 1 \
-        --repeat-count 100
+        --repeat-count 100 \
+        --input-length-range 128:256

 ShareGPT example usage:
     # This command samples 20 prompts with input lengths
@@ -29,7 +31,7 @@ import dataclasses
 import json
 import random
 import time
-from typing import List, Optional, Tuple
+from typing import Optional

 from transformers import PreTrainedTokenizerBase

@@ -75,9 +77,9 @@ def sample_requests_from_dataset(
     dataset_path: str,
     num_requests: int,
     tokenizer: PreTrainedTokenizerBase,
-    input_length_range: Tuple[int, int],
+    input_length_range: tuple[int, int],
     fixed_output_len: Optional[int],
-) -> List[Request]:
+) -> list[Request]:
     if fixed_output_len is not None and fixed_output_len < 4:
         raise ValueError("output_len too small")

@@ -97,7 +99,7 @@ def sample_requests_from_dataset(
     assert min_len >= 0 and max_len >= min_len, "input_length_range too small"

     # Filter out sequences that are too long or too short
-    filtered_requests: List[Request] = []
+    filtered_requests: list[Request] = []

     for i in range(len(dataset)):
         if len(filtered_requests) == num_requests:
@@ -120,10 +122,10 @@ def sample_requests_from_dataset(
 def sample_requests_from_random(
     num_requests: int,
     tokenizer: PreTrainedTokenizerBase,
-    input_length_range: Tuple[int, int],
+    input_length_range: tuple[int, int],
     fixed_output_len: Optional[int],
     prefix_len: int,
-) -> List[Request]:
+) -> list[Request]:

     requests = []
     prefix_token_ids = sample_tokens(tokenizer, prefix_len)
@@ -142,9 +144,9 @@ def sample_requests_from_random(
     return requests


-def repeat_and_sort_requests(requests: List[Request],
+def repeat_and_sort_requests(requests: list[Request],
                              repeat_count: int,
-                             sort: bool = False) -> List[str]:
+                             sort: bool = False) -> list[str]:
     repeated_requests = requests * repeat_count
     if sort:
         repeated_requests.sort(key=lambda x: x[1])
@@ -192,7 +194,9 @@ def main(args):

     llm = LLM(**dataclasses.asdict(engine_args))

-    sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len)
+    sampling_params = SamplingParams(temperature=0,
+                                     max_tokens=args.output_len,
+                                     detokenize=not args.disable_detokenize)

     print("Testing filtered requests")
     prompts = repeat_and_sort_requests(filtered_requests,
@@ -241,6 +245,12 @@ if __name__ == "__main__":
         "subtract this length when filtering prompts. Only used "
         "when dataset-path is not provided.",
     )
+    parser.add_argument(
+        '--disable-detokenize',
+        action='store_true',
+        help=("Do not detokenize responses (i.e. do not include "
+              "detokenization time in the latency measurement)"),
+    )

     parser = EngineArgs.add_cli_args(parser)
     args = parser.parse_args()
Offline prioritization benchmark:

@@ -1,10 +1,11 @@
+# SPDX-License-Identifier: Apache-2.0
 """Benchmark offline prioritization."""
 import argparse
 import dataclasses
 import json
 import random
 import time
-from typing import List, Optional, Tuple
+from typing import Optional

 from transformers import AutoTokenizer, PreTrainedTokenizerBase

@@ -12,12 +13,17 @@ from vllm.engine.arg_utils import EngineArgs
 from vllm.utils import FlexibleArgumentParser


+#Select a equi-probable random priority
+def get_random_flag():
+    return 0 if random.random() < 0.5 else 1
+
+
 def sample_requests(
     dataset_path: str,
     num_requests: int,
     tokenizer: PreTrainedTokenizerBase,
     fixed_output_len: Optional[int],
-) -> List[Tuple[str, int, int]]:
+) -> list[tuple[str, int, int, int]]:
     if fixed_output_len is not None and fixed_output_len < 4:
         raise ValueError("output_len too small")

@@ -34,7 +40,7 @@ def sample_requests(
     random.shuffle(dataset)

     # Filter out sequences that are too long or too short
-    filtered_dataset: List[Tuple[str, int, int]] = []
+    filtered_dataset: list[tuple[str, int, int]] = []
     for i in range(len(dataset)):
         if len(filtered_dataset) == num_requests:
             break
@@ -54,8 +60,7 @@ def sample_requests(
             # Prune too long sequences.
             continue

-        #Select a equi-probable random priority
-        priority = 0 if random.random() < 0.5 else 1
+        priority = get_random_flag()

         filtered_dataset.append((prompt, prompt_len, output_len, priority))

@@ -63,13 +68,20 @@ def sample_requests(


 def run_vllm(
-    requests: List[Tuple[str, int, int]],
+    requests: list[tuple[str, int, int]],
     n: int,
     engine_args: EngineArgs,
+    disable_detokenize: bool = False,
 ) -> float:
     from vllm import LLM, SamplingParams
     llm = LLM(**dataclasses.asdict(engine_args))

+    assert all(
+        llm.llm_engine.model_config.max_model_len >= (request[1] + request[2])
+        for request in requests), (
+            "Please ensure that max_model_len is greater than the sum of"
+            " input_len and output_len for all requests.")
+
     # Add the requests to the engine.
     prompts = []
     sampling_params = []
@@ -84,6 +96,7 @@ def run_vllm(
                 top_p=1.0,
                 ignore_eos=True,
                 max_tokens=output_len,
+                detokenize=not disable_detokenize,
             ))

     start = time.perf_counter()
@@ -102,15 +115,16 @@ def main(args: argparse.Namespace):
     if args.dataset is None:
         # Synthesize a prompt with the given input length.
         prompt = "hi" * (args.input_len - 1)
-        requests = [(prompt, args.input_len, args.output_len)
-                    for _ in range(args.num_prompts)]
+        requests = [(prompt, args.input_len, args.output_len,
+                     get_random_flag()) for _ in range(args.num_prompts)]
     else:
         requests = sample_requests(args.dataset, args.num_prompts, tokenizer,
                                    args.output_len)

     if args.backend == "vllm":
         elapsed_time = run_vllm(requests, args.n,
-                                EngineArgs.from_cli_args(args))
+                                EngineArgs.from_cli_args(args),
+                                args.disable_detokenize)
     else:
         raise ValueError(f"Unknown backend: {args.backend}")
     total_num_tokens = sum(prompt_len + output_len
@@ -163,6 +177,12 @@ if __name__ == "__main__":
                         type=str,
                         default=None,
                         help='Path to save the throughput results in JSON format.')
+    parser.add_argument(
+        '--disable-detokenize',
+        action='store_true',
+        help=("Do not detokenize responses (i.e. do not include "
+              "detokenization time in the latency measurement)"),
+    )

     parser = EngineArgs.add_cli_args(parser)
     args = parser.parse_args()

(The next file's diff is suppressed because it is too large.)
Serving benchmark with structured outputs (benchmarks/benchmark_serving_structured_output.py):

@@ -1,19 +1,17 @@
-r"""Benchmark online serving throughput with guided decoding.
+# SPDX-License-Identifier: Apache-2.0
+r"""Benchmark online serving throughput with structured outputs.

 On the server side, run one of the following commands:
     (vLLM OpenAI API server)
     vllm serve <your_model> --disable-log-requests

-    (TGI backend)
-    ./launch_tgi_server.sh <your_model> <max_batch_total_tokens>
-
 On the client side, run:
-    python benchmarks/benchmark_serving.py \
+    python benchmarks/benchmark_serving_structured_output.py \
         --backend <backend> \
         --model <your_model> \
         --dataset json \
-        --guided-decoding-ratio 1.0 \
-        --guided-decoding-backend xgrammar \
+        --structured-output-ratio 1.0 \
+        --structured-output-backend xgrammar \
         --request-rate 10 \
         --num-prompts 1000

@@ -23,14 +21,17 @@ On the client side, run:
 """
 import argparse
 import asyncio
+import copy
 import dataclasses
 import json
 import os
 import random
 import time
+import uuid
 import warnings
+from collections.abc import AsyncGenerator
 from dataclasses import dataclass
-from typing import AsyncGenerator, List, Optional, Tuple
+from typing import Optional

 import datasets
 import numpy as np
@@ -50,6 +51,9 @@ try:
 except ImportError:
     from argparse import ArgumentParser as FlexibleArgumentParser

+from vllm.v1.structured_output.utils import (
+    has_xgrammar_unsupported_json_features)
+
 MILLISECONDS_TO_SECONDS_CONVERSION = 1000


@@ -65,22 +69,22 @@ class BenchmarkMetrics:
     mean_ttft_ms: float
     median_ttft_ms: float
     std_ttft_ms: float
-    percentiles_ttft_ms: List[Tuple[float, float]]
+    percentiles_ttft_ms: list[tuple[float, float]]
     mean_tpot_ms: float
     median_tpot_ms: float
     std_tpot_ms: float
-    percentiles_tpot_ms: List[Tuple[float, float]]
+    percentiles_tpot_ms: list[tuple[float, float]]
     mean_itl_ms: float
     median_itl_ms: float
     std_itl_ms: float
-    percentiles_itl_ms: List[Tuple[float, float]]
+    percentiles_itl_ms: list[tuple[float, float]]
     # E2EL stands for end-to-end latency per request.
     # It is the time taken on the client side from sending
     # a request to receiving a complete response.
     mean_e2el_ms: float
     median_e2el_ms: float
     std_e2el_ms: float
-    percentiles_e2el_ms: List[Tuple[float, float]]
+    percentiles_e2el_ms: list[tuple[float, float]]


 @dataclasses.dataclass
@@ -103,25 +107,44 @@ class SampleRequest:


 def sample_requests(tokenizer: PreTrainedTokenizerBase,
-                    args: argparse.Namespace) -> List[SampleRequest]:
-    if args.dataset == 'json':
+                    args: argparse.Namespace) -> list[SampleRequest]:
+    if args.dataset == 'json' or args.dataset == 'json-unique':
         if args.json_schema_path is None:
             dir_path = os.path.dirname(os.path.realpath(__file__))
             args.json_schema_path = os.path.join(dir_path,
                                                  "structured_schemas",
                                                  "structured_schema_1.json")
+        json_schemas = []
         with open(args.json_schema_path) as f:
             schema = json.load(f)
-        prompt = f"Generate an example of a user profile given the following schema: {json.dumps(schema)}"  # noqa: E501
-        input_len = len(tokenizer(prompt).input_ids)
-        print(f"Input length of the prompt: {input_len} tokens")
+
+        if args.dataset == 'json-unique':
+            json_schemas = [
+                copy.deepcopy(schema) for _ in range(args.num_prompts)
+            ]
+            for i in range(len(json_schemas)):
+                json_schemas[i]["properties"][
+                    f"__optional_field_{uuid.uuid4()}"] = {
+                        "type":
+                        "string",
+                        "description":
+                        "An unique optional field to avoid cached schemas"
+                    }
+
+        def gen_prompt(index: int):
+            schema = json_schemas[index % len(json_schemas)]
+            return f"Generate an example of a user profile given the following schema: {json.dumps(schema)}"  # noqa: E501
+
+        def get_schema(index: int):
+            return json_schemas[index % len(json_schemas)]
+
         requests = [
-            SampleRequest(prompt=prompt,
-                          prompt_len=input_len,
+            SampleRequest(prompt=gen_prompt(i),
+                          prompt_len=len(tokenizer(gen_prompt(i)).input_ids),
                           expected_output_len=args.output_len,
-                          schema=schema,
+                          schema=get_schema(i),
                           structure_type=args.structure_type)
-            for _ in range(args.num_prompts)
+            for i in range(args.num_prompts)
         ]

     elif args.dataset == "grammar":
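For illustration (not part of the diff): the 'json-unique' path above defeats schema caching by giving every copy of the schema a structurally distinct optional property. A self-contained sketch of that trick; the base schema here is a stand-in, not the benchmark's structured_schema_1.json:

# Make N structurally distinct copies of a JSON schema so a schema-compiling
# guided-decoding backend cannot reuse a cached compilation across requests.
import copy
import json
import uuid

base_schema = {"type": "object", "properties": {"name": {"type": "string"}}}

schemas = [copy.deepcopy(base_schema) for _ in range(3)]
for schema in schemas:
    schema["properties"][f"__optional_field_{uuid.uuid4()}"] = {
        "type": "string",
        "description": "An unique optional field to avoid cached schemas",
    }

# Every serialized schema is now different, so each request triggers a
# fresh compilation instead of a cache hit.
assert len({json.dumps(s, sort_keys=True) for s in schemas}) == 3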
@@ -186,10 +209,20 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase,
         ]

     elif args.dataset == "xgrammar_bench":
-        requests: List[SampleRequest] = []
+        requests: list[SampleRequest] = []
         dataset = datasets.load_dataset("NousResearch/json-mode-eval",
                                         split="train")
-        print(f"dataset has {len(dataset)} entries")
+        full_dataset_len = len(dataset)
+
+        def _filter_func(item):
+            import json
+            schema = json.loads(item["schema"])
+            return not has_xgrammar_unsupported_json_features(schema)
+
+        dataset = dataset.filter(_filter_func)
+        num_filtered_out = full_dataset_len - len(dataset)
+        print(f"dataset has {len(dataset)} entries after filtering "
+              f"out {num_filtered_out} entries with unsupported features")
         len_dataset = len(dataset)
         for data_point_idx in range(args.num_prompts):
             idx = data_point_idx
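For illustration (not part of the diff): the filtering added above follows the usual Hugging Face datasets pattern of building a predicate, calling .filter(), and reporting how many rows were dropped. A generic sketch of that pattern; the predicate here is a simple stand-in, not the real xgrammar feature check:

# Generic filter-and-report pattern for pruning dataset entries.
import json

from datasets import load_dataset

dataset = load_dataset("NousResearch/json-mode-eval", split="train")
full_len = len(dataset)

def _keep(item):
    schema = json.loads(item["schema"])
    # Illustrative check only: drop schemas using "allOf" composition.
    return "allOf" not in schema

dataset = dataset.filter(_keep)
print(f"dataset has {len(dataset)} entries after filtering "
      f"out {full_len - len(dataset)} entries")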
@@ -213,26 +246,26 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase,


 async def get_request(
-    input_requests: List[SampleRequest],
+    input_requests: list[SampleRequest],
     request_rate: float,
     burstiness: float = 1.0,
-) -> AsyncGenerator[Tuple[int, SampleRequest], None]:
+) -> AsyncGenerator[tuple[int, SampleRequest], None]:
     """
     Asynchronously generates requests at a specified rate
     with OPTIONAL burstiness.

     Args:
         input_requests:
             A list of input requests, each represented as a tuple.
         request_rate:
             The rate at which requests are generated (requests/s).
         burstiness (optional):
             The burstiness factor of the request generation.
             Only takes effect when request_rate is not inf.
             Default value is 1, which follows a Poisson process.
             Otherwise, the request intervals follow a gamma distribution.
             A lower burstiness value (0 < burstiness < 1) results
             in more bursty requests, while a higher burstiness value
             (burstiness > 1) results in a more uniform arrival of requests.
     """
     input_requests = iter(input_requests)
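For illustration (not part of the diff): the burstiness factor documented above maps onto a gamma distribution for inter-arrival times with shape = burstiness and scale = 1 / (request_rate * burstiness), which keeps the mean interval at 1 / request_rate. A small numpy sketch of how such intervals could be drawn; this is an illustrative sampler under those assumptions, not the benchmark's exact code:

# Draw inter-arrival times whose mean is 1/request_rate while the burstiness
# factor controls their variance (burstiness=1 reduces to a Poisson process).
import numpy as np

def sample_intervals(num_requests, request_rate, burstiness=1.0, seed=0):
    rng = np.random.default_rng(seed)
    theta = 1.0 / (request_rate * burstiness)  # gamma scale parameter
    return rng.gamma(shape=burstiness, scale=theta, size=num_requests)

for b in (0.5, 1.0, 4.0):
    intervals = sample_intervals(10_000, request_rate=10.0, burstiness=b)
    print(f"burstiness={b}: mean={intervals.mean():.3f}s, "
          f"std={intervals.std():.3f}s")
# Lower burstiness -> larger spread (burstier arrivals); higher -> more uniform.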
@@ -257,22 +290,23 @@ async def get_request(


 def calculate_metrics(
-    input_requests: List[Tuple[str, int, int]],
-    outputs: List[RequestFuncOutput],
+    input_requests: list[tuple[str, int, int]],
+    outputs: list[RequestFuncOutput],
     dur_s: float,
     tokenizer: PreTrainedTokenizerBase,
-    selected_percentile_metrics: List[str],
-    selected_percentiles: List[float],
-) -> Tuple[BenchmarkMetrics, List[int]]:
-    actual_output_lens: List[int] = []
+    selected_percentile_metrics: list[str],
+    selected_percentiles: list[float],
+    goodput_config_dict: Optional[dict[str, float]] = None,
+) -> tuple[BenchmarkMetrics, list[int]]:
+    actual_output_lens: list[int] = []
     total_input = 0
     completed = 0
     good_completed = 0
-    itls: List[float] = []
-    tpots: List[float] = []
-    all_tpots: List[float] = []
-    ttfts: List[float] = []
-    e2els: List[float] = []
+    itls: list[float] = []
+    tpots: list[float] = []
+    all_tpots: list[float] = []
+    ttfts: list[float] = []
+    e2els: list[float] = []
     for i in range(len(outputs)):
         if outputs[i].success:
             # We use the tokenizer to count the number of output tokens for all
@@ -286,10 +320,10 @@ def calculate_metrics(
             total_input += input_requests[i].prompt_len
             tpot = 0
             if output_len > 1:
-                tpot = (outputs[i].latency - outputs[i].ttft) / (output_len -
-                                                                 1)
+                latency_minus_ttft = outputs[i].latency - outputs[i].ttft
+                tpot = latency_minus_ttft / (output_len - 1)
                 tpots.append(tpot)
-            outputs[i].tpot = sum(tpots) / len(tpots) if len(tpots) else 0
+            outputs[i].tpot = tpot
             # Note: if output_len <= 1, we regard tpot as 0 for goodput
             all_tpots.append(tpot)
             itls += outputs[i].itl
@@ -299,6 +333,28 @@ def calculate_metrics(
         else:
             actual_output_lens.append(0)

+    if goodput_config_dict:
+        valid_metrics = []
+        slo_values = []
+
+        if "ttft" in goodput_config_dict:
+            valid_metrics.append(ttfts)
+            slo_values.append(goodput_config_dict["ttft"] /
+                              MILLISECONDS_TO_SECONDS_CONVERSION)
+        if "tpot" in goodput_config_dict:
+            valid_metrics.append(all_tpots)
+            slo_values.append(goodput_config_dict["tpot"] /
+                              MILLISECONDS_TO_SECONDS_CONVERSION)
+        if "e2el" in goodput_config_dict:
+            valid_metrics.append(e2els)
+            slo_values.append(goodput_config_dict["e2el"] /
+                              MILLISECONDS_TO_SECONDS_CONVERSION)
+
+        for req_metric in zip(*valid_metrics):
+            is_good_req = all([s >= r for s, r in zip(slo_values, req_metric)])
+            if is_good_req:
+                good_completed += 1
+
     if completed == 0:
         warnings.warn(
             "All requests failed. This is likely due to a misconfiguration "
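For illustration (not part of the diff): the goodput accounting added above counts a request as "good" only if every configured SLO is met, with the millisecond CLI values converted to seconds before comparison. A compact standalone sketch of that per-request check; the measurements and SLO numbers are made-up examples:

# Count "good" requests: every selected metric must satisfy its SLO.
MS_TO_S = 1000

requests = [  # per-request measurements in seconds: ttft, tpot, e2el
    {"ttft": 0.12, "tpot": 0.030, "e2el": 1.8},
    {"ttft": 0.45, "tpot": 0.025, "e2el": 2.4},
    {"ttft": 0.09, "tpot": 0.060, "e2el": 1.1},
]
goodput_config = {"ttft": 200, "tpot": 50}  # example SLOs, in milliseconds

good = 0
for req in requests:
    slos_met = all(req[name] <= limit / MS_TO_S
                   for name, limit in goodput_config.items())
    good += int(slos_met)

print(f"goodput: {good}/{len(requests)} requests met all SLOs")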
@@ -344,17 +400,18 @@ async def benchmark(
     base_url: str,
     model_id: str,
     tokenizer: PreTrainedTokenizerBase,
-    input_requests: List[SampleRequest],
+    input_requests: list[SampleRequest],
     request_rate: float,
     burstiness: float,
     disable_tqdm: bool,
     profile: bool,
-    selected_percentile_metrics: List[str],
-    selected_percentiles: List[str],
+    selected_percentile_metrics: list[str],
+    selected_percentiles: list[str],
     ignore_eos: bool,
     max_concurrency: Optional[int],
-    guided_decoding_ratio: float,
-    guided_decoding_backend: str,
+    structured_output_ratio: float,
+    structured_output_backend: str,
+    goodput_config_dict: Optional[dict[str, float]] = None,
 ):
     if backend in ASYNC_REQUEST_FUNCS:
         request_func = ASYNC_REQUEST_FUNCS[backend]
@@ -365,16 +422,18 @@ async def benchmark(
         extra_body = {}
         # Add the schema to the extra_body
         extra_body[request.structure_type] = request.schema
-        # Add the specific guided_decoding_backend
-        extra_body["guided_decoding_backend"] = guided_decoding_backend
+        # Add the specific structured_output_backend
+        extra_body["guided_decoding_backend"] = structured_output_backend
         return extra_body

     print("Starting initial single prompt test run...")
-    guided_decoding_req_idx = random.sample(
+    structured_output_req_idx = random.sample(
         range(len(input_requests)),
-        int(len(input_requests) * guided_decoding_ratio))
+        int(len(input_requests) * structured_output_ratio))

     test_request = input_requests[0]
+    test_req_extra_body = (prepare_extra_body(test_request)
+                           if 0 in structured_output_req_idx else None)
     test_input = RequestFuncInput(
         model=model_id,
         prompt=test_request.prompt,
@@ -382,7 +441,7 @@ async def benchmark(
         prompt_len=test_request.prompt_len,
         output_len=test_request.expected_output_len,
         ignore_eos=ignore_eos,
-        extra_body=prepare_extra_body(test_request),
+        extra_body=test_req_extra_body,
     )
     test_output = await request_func(request_func_input=test_input)
     if not test_output.success:
@@ -401,7 +460,7 @@ async def benchmark(
             prompt_len=test_request.prompt_len,
             output_len=test_request.expected_output_len,
             ignore_eos=ignore_eos,
-            extra_body=prepare_extra_body(test_request),
+            extra_body=test_req_extra_body,
         )
         profile_output = await request_func(request_func_input=profile_input)
         if profile_output.success:
@@ -434,12 +493,12 @@ async def benchmark(
                                         pbar=pbar)

     benchmark_start_time = time.perf_counter()
-    tasks: List[asyncio.Task] = []
-    expected: List[str] = []
+    tasks: list[asyncio.Task] = []
+    expected: list[str] = []
     async for i, request in get_request(input_requests, request_rate,
                                         burstiness):
         extra_body = prepare_extra_body(
-            request) if i in guided_decoding_req_idx else None
+            request) if i in structured_output_req_idx else None
         request_func_input = RequestFuncInput(
             model=model_id,
             prompt=request.prompt,
@@ -454,7 +513,7 @@ async def benchmark(
             asyncio.create_task(
                 limited_request_func(request_func_input=request_func_input,
                                      pbar=pbar)))
-    outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks)
+    outputs: list[RequestFuncOutput] = await asyncio.gather(*tasks)

     if profile:
         print("Stopping profiler...")
@@ -482,6 +541,7 @@ async def benchmark(
         tokenizer=tokenizer,
         selected_percentile_metrics=selected_percentile_metrics,
         selected_percentiles=selected_percentiles,
+        goodput_config_dict=goodput_config_dict,
     )

     print("{s:{c}^{n}}".format(s=' Serving Benchmark Result ', n=50, c='='))
@@ -493,6 +553,9 @@ async def benchmark(
                                     metrics.total_output))
     print("{:<40} {:<10.2f}".format("Request throughput (req/s):",
                                     metrics.request_throughput))
+    if goodput_config_dict:
+        print("{:<40} {:<10.2f}".format("Request goodput (req/s):",
+                                        metrics.request_goodput))
     print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):",
                                     metrics.output_throughput))
     print("{:<40} {:<10.2f}".format("Total Token throughput (tok/s):",
@@ -616,6 +679,40 @@ def evaluate(ret, args):
         100) if len(not_none_scores) > 0 else None


+def parse_goodput(slo_pairs):
+    goodput_config_dict = {}
+    try:
+        for slo_pair in slo_pairs:
+            slo_name, slo_val = slo_pair.split(":")
+            goodput_config_dict[slo_name] = float(slo_val)
+    except ValueError as err:
+        raise argparse.ArgumentTypeError(
+            "Invalid format found for service level objectives. "
+            "Specify service level objectives for goodput as \"KEY:VALUE\" "
+            "pairs, where the key is a metric name, and the value is a "
+            "number in milliseconds.") from err
+    return goodput_config_dict
+
+
+def check_goodput_args(args):
+    goodput_config_dict = {}
+    VALID_NAMES = ["ttft", "tpot", "e2el"]
+    if args.goodput:
+        goodput_config_dict = parse_goodput(args.goodput)
+        for slo_name, slo_val in goodput_config_dict.items():
+            if slo_name not in VALID_NAMES:
+                raise ValueError(
+                    f"Invalid metric name found, {slo_name}: {slo_val}. "
+                    "The service level objective name should be one of "
+                    f"{str(VALID_NAMES)}. ")
+            if slo_val < 0:
+                raise ValueError(
+                    f"Invalid value found, {slo_name}: {slo_val}. "
+                    "The service level objective value should be "
+                    "non-negative.")
+    return goodput_config_dict
+
+
 def main(args: argparse.Namespace):
     print(args)
     random.seed(args.seed)
@@ -632,8 +729,11 @@ def main(args: argparse.Namespace):
     api_url = f"http://{args.host}:{args.port}{args.endpoint}"
     base_url = f"http://{args.host}:{args.port}"

-    tokenizer = get_tokenizer(tokenizer_id,
-                              trust_remote_code=args.trust_remote_code)
+    tokenizer = get_tokenizer(
+        tokenizer_id,
+        trust_remote_code=args.trust_remote_code,
+        tokenizer_mode=args.tokenizer_mode,
+    )

     if args.dataset == 'grammar':
         args.structure_type = 'guided_grammar'
@@ -644,10 +744,10 @@ def main(args: argparse.Namespace):
     else:
         args.structure_type = 'guided_json'

-    if args.no_guided_decoding:
-        args.guided_decoding_ratio = 0
+    if args.no_structured_output:
+        args.structured_output_ratio = 0
     if args.save_results:
-        result_file_name = f'{args.guided_decoding_ratio}guided'
+        result_file_name = f'{args.structured_output_ratio}guided'
         result_file_name += f"_{backend}"
         result_file_name += f"_{args.request_rate}qps"
         result_file_name += f"_{args.model.split('/')[-1]}"
@@ -660,6 +760,8 @@ def main(args: argparse.Namespace):

     input_requests = sample_requests(tokenizer, args)

+    goodput_config_dict = check_goodput_args(args)
+
     benchmark_result, ret = asyncio.run(
         benchmark(
             backend=backend,
@@ -678,8 +780,9 @@ def main(args: argparse.Namespace):
             ],
             ignore_eos=args.ignore_eos,
             max_concurrency=args.max_concurrency,
-            guided_decoding_ratio=args.guided_decoding_ratio,
-            guided_decoding_backend=args.guided_decoding_backend,
+            structured_output_ratio=args.structured_output_ratio,
+            structured_output_backend=args.structured_output_backend,
+            goodput_config_dict=goodput_config_dict,
         ))

     # Save config and results to json
@@ -730,7 +833,8 @@ if __name__ == "__main__":
         default=None,
         help="Server or API base url if not using http host and port.",
     )
-    parser.add_argument("--host", type=str, default="localhost")
+    # Use 127.0.0.1 here instead of localhost to force the use of ipv4
+    parser.add_argument("--host", type=str, default="127.0.0.1")
     parser.add_argument("--port", type=int, default=8000)
     parser.add_argument(
         "--endpoint",
@@ -738,10 +842,12 @@ if __name__ == "__main__":
         default="/v1/completions",
         help="API endpoint.",
     )
-    parser.add_argument(
-        "--dataset",
-        default='json',
-        choices=['json', 'grammar', 'regex', 'choice', 'xgrammar_bench'])
+    parser.add_argument("--dataset",
+                        default='json',
+                        choices=[
+                            'json', 'json-unique', 'grammar', 'regex',
+                            'choice', 'xgrammar_bench'
+                        ])
     parser.add_argument("--json_schema_path",
                         type=str,
                         default=None,
@@ -770,6 +876,13 @@ if __name__ == "__main__":
         help=
         "Name or path of the tokenizer, if not using the default tokenizer.",  # noqa: E501
     )
+    parser.add_argument(
+        "--tokenizer-mode",
+        type=str,
+        default="auto",
+        help=
+        "Name or path of the tokenizer, if not using the default tokenizer.",  # noqa: E501
+    )
     parser.add_argument(
         "--num-prompts",
         type=int,
@@ -863,19 +976,32 @@ if __name__ == "__main__":
         "Default value is \"99\". "
         "Use \"--percentile-metrics\" to select metrics.",
     )
-    parser.add_argument("--no-guided-decoding",
+    parser.add_argument(
+        "--goodput",
+        nargs="+",
+        required=False,
+        help="Specify service level objectives for goodput as \"KEY:VALUE\" "
+        "pairs, where the key is a metric name, and the value is in "
+        "milliseconds. Multiple \"KEY:VALUE\" pairs can be provided, "
+        "separated by spaces. Allowed request level metric names are "
+        "\"ttft\", \"tpot\", \"e2el\". For more context on the definition of "
+        "goodput, refer to DistServe paper: https://arxiv.org/pdf/2401.09670 "
+        "and the blog: https://hao-ai-lab.github.io/blogs/distserve")
+
+    parser.add_argument("--no-structured-output",
                         action='store_true',
                         default=False,
                         help="Whether to disable JSON decoding or not.")
-    parser.add_argument("--guided-decoding-ratio",
+    parser.add_argument("--structured-output-ratio",
                         type=float,
                         default=1.0,
-                        help="Ratio of Guided Decoding requests")
-    parser.add_argument("--guided-decoding-backend",
-                        type=str,
-                        choices=["outlines", "lm-format-enforcer", "xgrammar"],
-                        default="xgrammar",
-                        help="Backend to use for guided decoding")
+                        help="Ratio of Structured Outputs requests")
+    parser.add_argument(
+        "--structured-output-backend",
+        type=str,
+        choices=["outlines", "lm-format-enforcer", "xgrammar", "guidance"],
+        default="xgrammar",
+        help="Backend to use for structured outputs")

     args = parser.parse_args()
     main(args)
Offline inference throughput benchmark:

@@ -1,15 +1,21 @@
+# SPDX-License-Identifier: Apache-2.0
 """Benchmark offline inference throughput."""
 import argparse
 import dataclasses
 import json
+import os
 import random
 import time
-from functools import cache
-from typing import Dict, List, Optional, Tuple
+import warnings
+from typing import Any, Optional, Union

 import torch
 import uvloop
-from PIL import Image
+from benchmark_dataset import (BurstGPTDataset, ConversationDataset,
+                               InstructCoderDataset, RandomDataset,
+                               SampleRequest, ShareGPTDataset, SonnetDataset,
+                               VisionArenaDataset)
+from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
 from tqdm import tqdm
 from transformers import (AutoModelForCausalLM, AutoTokenizer,
                           PreTrainedTokenizerBase)
@@ -17,163 +23,35 @@ from transformers import (AutoModelForCausalLM, AutoTokenizer,
 from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
 from vllm.entrypoints.openai.api_server import (
     build_async_engine_client_from_engine_args)
-from vllm.inputs import TextPrompt
+from vllm.inputs import TextPrompt, TokensPrompt
 from vllm.lora.request import LoRARequest
-from vllm.lora.utils import get_adapter_absolute_path
-from vllm.multimodal import MultiModalDataDict
+from vllm.outputs import RequestOutput
 from vllm.sampling_params import BeamSearchParams
-from vllm.transformers_utils.tokenizer import AnyTokenizer, get_lora_tokenizer
 from vllm.utils import FlexibleArgumentParser, merge_async_iterators


-@dataclasses.dataclass
-class SampleRequest:
-    """A class representing a single inference request for benchmarking.
-
-    Attributes:
-        prompt: The input text prompt for the model.
-        prompt_len: The length of the prompt in tokens.
-        expected_output_len: The expected length of the output in tokens.
-        multi_modal_data: Optional dictionary containing multi-modal data (e.g.
-            images).
-        lora_request: Optional LoRARequest specifying the LoRA to use.
-    """
-    prompt: str
-    prompt_len: int
-    expected_output_len: int
-    multi_modal_data: Optional[MultiModalDataDict] = None
-    lora_request: Optional[LoRARequest] = None
-
-
-def _get_prompt_for_image_model(question: str, *, model: str) -> str:
-    """Prepend and append special tokens around the question to form a prompt.
-
-    Args:
-        question: The input question text to wrap with special tokens
-        model: The name of the model being used, to determine which special
-            tokens to add
-
-    Returns:
-        The formatted prompt string with appropriate special tokens for the
-            model
-
-    Raises:
-        ValueError: If an unsupported model name is provided
-    """
-    model = model.lower()
-    if "pixtral" in model:
-        return f"<s>[INST]{question}\n[IMG][/INST]"
-    raise ValueError(f"Unsupported model {model}")
-
-
-@cache
-def lora_path_on_disk(lora_path: str) -> str:
-    return get_adapter_absolute_path(lora_path)
-
-
-lora_tokenizer_cache: Dict[int, AnyTokenizer] = {}
-
-
-def get_random_lora_request(
-        args: argparse.Namespace
-) -> Tuple[LoRARequest, Optional[AnyTokenizer]]:
-    global lora_tokenizer_cache
-    lora_id = random.randint(1, args.max_loras)
-    lora_request = LoRARequest(lora_name=str(lora_id),
-                               lora_int_id=lora_id,
-                               lora_path=lora_path_on_disk(args.lora_path))
-    if lora_id not in lora_tokenizer_cache:
-        lora_tokenizer_cache[lora_id] = get_lora_tokenizer(lora_request)
-    return lora_request, lora_tokenizer_cache[lora_id]
-
-
-def sample_requests(tokenizer: PreTrainedTokenizerBase,
-                    args: argparse.Namespace) -> List[SampleRequest]:
-
-    dataset_path: str = args.dataset
-    num_requests: int = args.num_prompts
-    fixed_output_len: Optional[int] = args.output_len
-    model: str = args.model
-    if fixed_output_len is not None and fixed_output_len < 4:
-        raise ValueError("output_len too small")
-
-    # Load the dataset.
-    with open(dataset_path) as f:
-        dataset = json.load(f)
-    # Filter out the conversations with less than 2 turns.
-    dataset = [data for data in dataset if len(data["conversations"]) >= 2]
-    # Shuffle the dataset.
-    random.shuffle(dataset)
-
-    # Filter out sequences that are too long or too short
-    filtered_dataset: List[SampleRequest] = []
-    for data in tqdm(dataset,
-                     total=len(filtered_dataset),
-                     desc="sampling requests"):
-        if len(filtered_dataset) == num_requests:
-            break
-
-        # Only keep the first two turns of each conversation.
-        prompt = data["conversations"][0]["value"]
-        completion = data["conversations"][1]["value"]
-
-        multi_modal_data: Optional[MultiModalDataDict] = None
-        if "image" in data:
-            multi_modal_data = multi_modal_data or {}
-            image_path = data["image"]
-            # TODO(vllm-project/vllm/issues/9778): Support multiple images.
-            assert isinstance(image_path,
-                              str), "Only support single image input"
-            try:
-                multi_modal_data["image"] = Image.open(image_path).convert(
-                    "RGB")
-            except FileNotFoundError:
-                # Ignore datapoint where asset is missing
-                continue
-            prompt = _get_prompt_for_image_model(question=prompt, model=model)
-
-        request_tokenizer = tokenizer
-        lora_request: Optional[LoRARequest] = None
-        if args.enable_lora:
-            lora_request, lora_tokenizer = get_random_lora_request(args)
-            if lora_tokenizer:
-                request_tokenizer = lora_tokenizer
-
-        # Tokenize the prompts and completions.
-        prompt_token_ids = request_tokenizer(prompt).input_ids
-        completion_token_ids = request_tokenizer(completion).input_ids
-        prompt_len = len(prompt_token_ids)
-        output_len = len(completion_token_ids
-                         ) if fixed_output_len is None else fixed_output_len
-        if prompt_len < 4 or output_len < 4:
-            # Prune too short sequences.
-            continue
-        if prompt_len > 1024 or prompt_len + output_len > 2048:
-            # Prune too long sequences.
-            continue
-        filtered_dataset.append(
-            SampleRequest(prompt=prompt,
-                          prompt_len=prompt_len,
-                          expected_output_len=output_len,
-                          multi_modal_data=multi_modal_data,
-                          lora_request=lora_request))
-
-    return filtered_dataset
-
-
 def run_vllm(
-    requests: List[SampleRequest],
+    requests: list[SampleRequest],
     n: int,
     engine_args: EngineArgs,
-) -> float:
+    disable_detokenize: bool = False,
+) -> tuple[float, Optional[list[RequestOutput]]]:
     from vllm import LLM, SamplingParams
     llm = LLM(**dataclasses.asdict(engine_args))
+    assert all(
+        llm.llm_engine.model_config.max_model_len >= (
+            request.prompt_len + request.expected_output_len)
+        for request in requests), (
+            "Please ensure that max_model_len is greater than the sum of"
+            " prompt_len and expected_output_len for all requests.")
     # Add the requests to the engine.
-    prompts: List[TextPrompt] = []
-    sampling_params: List[SamplingParams] = []
+    prompts: list[Union[TextPrompt, TokensPrompt]] = []
+    sampling_params: list[SamplingParams] = []
     for request in requests:
         prompts.append(
+            TokensPrompt(prompt_token_ids=request.prompt["prompt_token_ids"],
+                         multi_modal_data=request.multi_modal_data)
+            if "prompt_token_ids" in request.prompt else \
             TextPrompt(prompt=request.prompt,
                        multi_modal_data=request.multi_modal_data))
         sampling_params.append(
@@ -183,19 +61,21 @@ def run_vllm(
                 top_p=1.0,
                 ignore_eos=True,
                 max_tokens=request.expected_output_len,
+                detokenize=not disable_detokenize,
             ))
-    lora_requests: Optional[List[LoRARequest]] = None
+    lora_requests: Optional[list[LoRARequest]] = None
     if engine_args.enable_lora:
         lora_requests = [request.lora_request for request in requests]

     use_beam_search = False

+    outputs = None
     if not use_beam_search:
         start = time.perf_counter()
-        llm.generate(prompts,
-                     sampling_params,
-                     lora_request=lora_requests,
-                     use_tqdm=True)
+        outputs = llm.generate(prompts,
+                               sampling_params,
+                               lora_request=lora_requests,
+                               use_tqdm=True)
         end = time.perf_counter()
     else:
         assert lora_requests is None, "BeamSearch API does not support LoRA"
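For illustration (not part of the diff): the prompt-construction change above lets a benchmark request carry either pre-tokenized ids or raw text, dispatching to TokensPrompt or TextPrompt accordingly. A hedged sketch of that dispatch, assuming a local vLLM install; the token ids and text below are placeholders:

# Build vLLM prompt objects from heterogeneous benchmark requests:
# pre-tokenized requests become TokensPrompt, plain text becomes TextPrompt.
from vllm.inputs import TextPrompt, TokensPrompt

requests = [
    {"prompt": {"prompt_token_ids": [1, 2, 3, 4]}},   # already tokenized
    {"prompt": "Summarize the following document:"},  # raw text
]

prompts = []
for request in requests:
    prompt = request["prompt"]
    if isinstance(prompt, dict) and "prompt_token_ids" in prompt:
        prompts.append(TokensPrompt(prompt_token_ids=prompt["prompt_token_ids"]))
    else:
        prompts.append(TextPrompt(prompt=prompt))

print(prompts)  # ready to pass to LLM.generate(...)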
@ -213,26 +93,75 @@ def run_vllm(
|
|||||||
ignore_eos=True,
|
ignore_eos=True,
|
||||||
))
|
))
|
||||||
end = time.perf_counter()
|
end = time.perf_counter()
|
||||||
return end - start
|
return end - start, outputs
|
||||||
|
|
||||||
|
|
||||||
+def run_vllm_chat(
+        requests: list[SampleRequest],
+        n: int,
+        engine_args: EngineArgs,
+        disable_detokenize: bool = False) -> tuple[float, list[RequestOutput]]:
+    """
+    Run vLLM chat benchmark. This function is recommended ONLY for benchmarking
+    multimodal models as it properly handles multimodal inputs and chat
+    formatting. For non-multimodal models, use run_vllm() instead.
+    """
+    from vllm import LLM, SamplingParams
+    llm = LLM(**dataclasses.asdict(engine_args))
+
+    assert all(
+        llm.llm_engine.model_config.max_model_len >= (
+            request.prompt_len + request.expected_output_len)
+        for request in requests), (
+            "Please ensure that max_model_len is greater than the sum of "
+            "prompt_len and expected_output_len for all requests.")
+
+    prompts = []
+    sampling_params: list[SamplingParams] = []
+    for request in requests:
+        prompts.append(request.prompt)
+        sampling_params.append(
+            SamplingParams(
+                n=n,
+                temperature=1.0,
+                top_p=1.0,
+                ignore_eos=True,
+                max_tokens=request.expected_output_len,
+                detokenize=not disable_detokenize,
+            ))
+    start = time.perf_counter()
+    outputs = llm.chat(prompts, sampling_params, use_tqdm=True)
+    end = time.perf_counter()
+    return end - start, outputs

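For orientation, a minimal sketch (illustrative only, not part of the diff) of driving the new chat path directly; the model id is an assumption and `requests` is presumed to come from get_requests() introduced later in this file.

    # Sketch only: assumes `requests` was built by get_requests() for a
    # multimodal-capable model; the model id below is a placeholder.
    from vllm.engine.arg_utils import EngineArgs

    engine_args = EngineArgs(model="Qwen/Qwen2-VL-2B-Instruct")  # assumed model id
    elapsed, chat_outputs = run_vllm_chat(requests, n=1, engine_args=engine_args)
    print(f"{len(requests) / elapsed:.2f} requests/s")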
async def run_vllm_async(
-    requests: List[SampleRequest],
+    requests: list[SampleRequest],
    n: int,
    engine_args: AsyncEngineArgs,
    disable_frontend_multiprocessing: bool = False,
+    disable_detokenize: bool = False,
) -> float:
    from vllm import SamplingParams

    async with build_async_engine_client_from_engine_args(
            engine_args, disable_frontend_multiprocessing) as llm:
+        assert all(
+            llm.model_config.max_model_len >= (request.prompt_len +
+                                               request.expected_output_len)
+            for request in requests), (
+                "Please ensure that max_model_len is greater than the sum of"
+                " prompt_len and expected_output_len for all requests.")
+
        # Add the requests to the engine.
-        prompts: List[TextPrompt] = []
-        sampling_params: List[SamplingParams] = []
-        lora_requests: List[Optional[LoRARequest]] = []
+        prompts: list[Union[TextPrompt, TokensPrompt]] = []
+        sampling_params: list[SamplingParams] = []
+        lora_requests: list[Optional[LoRARequest]] = []
        for request in requests:
            prompts.append(
+                TokensPrompt(prompt_token_ids=request.prompt["prompt_token_ids"],
+                             multi_modal_data=request.multi_modal_data)
+                if "prompt_token_ids" in request.prompt else \
                TextPrompt(prompt=request.prompt,
                           multi_modal_data=request.multi_modal_data))
            sampling_params.append(
@@ -242,6 +171,7 @@ async def run_vllm_async(
                    top_p=1.0,
                    ignore_eos=True,
                    max_tokens=request.expected_output_len,
+                    detokenize=not disable_detokenize,
                ))
            lora_requests.append(request.lora_request)

@@ -262,12 +192,13 @@ async def run_vllm_async(


def run_hf(
-    requests: List[SampleRequest],
+    requests: list[SampleRequest],
    model: str,
    tokenizer: PreTrainedTokenizerBase,
    n: int,
    max_batch_size: int,
    trust_remote_code: bool,
+    disable_detokenize: bool = False,
) -> float:
    llm = AutoModelForCausalLM.from_pretrained(
        model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code)
@@ -278,7 +209,7 @@ def run_hf(

    pbar = tqdm(total=len(requests))
    start = time.perf_counter()
-    batch: List[str] = []
+    batch: list[str] = []
    max_prompt_len = 0
    max_output_len = 0
    for i in range(len(requests)):
@@ -307,8 +238,9 @@ def run_hf(
                use_cache=True,
                max_new_tokens=max_output_len,
            )
-            # Include the decoding time.
-            tokenizer.batch_decode(llm_outputs, skip_special_tokens=True)
+            if not disable_detokenize:
+                # Include the decoding time.
+                tokenizer.batch_decode(llm_outputs, skip_special_tokens=True)
            pbar.update(len(batch))

            # Clear the batch.
@@ -320,7 +252,7 @@ def run_hf(


def run_mii(
-    requests: List[SampleRequest],
+    requests: list[SampleRequest],
    model: str,
    tensor_parallel_size: int,
    output_len: int,
@@ -337,58 +269,89 @@ def run_mii(
    return end - start

+def save_to_pytorch_benchmark_format(args: argparse.Namespace,
+                                     results: dict[str, Any]) -> None:
+    pt_records = convert_to_pytorch_benchmark_format(
+        args=args,
+        metrics={
+            "requests_per_second": [results["requests_per_second"]],
+            "tokens_per_second": [results["tokens_per_second"]],
+        },
+        extra_info={
+            k: results[k]
+            for k in ["elapsed_time", "num_requests", "total_num_tokens"]
+        })
+    if pt_records:
+        # Don't use json suffix here as we don't want CI to pick it up
+        pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch.json"
+        write_to_json(pt_file, pt_records)

+def get_requests(args, tokenizer):
+    # Common parameters for all dataset types.
+    common_kwargs = {
+        "dataset_path": args.dataset_path,
+        "random_seed": args.seed,
+    }
+    sample_kwargs = {
+        "tokenizer": tokenizer,
+        "lora_path": args.lora_path,
+        "max_loras": args.max_loras,
+        "num_requests": args.num_prompts,
+        "input_len": args.input_len,
+        "output_len": args.output_len,
+    }
+
+    if args.dataset_path is None or args.dataset_name == "random":
+        sample_kwargs["range_ratio"] = args.random_range_ratio
+        sample_kwargs["prefix_len"] = args.prefix_len
+        dataset_cls = RandomDataset
+    elif args.dataset_name == "sharegpt":
+        dataset_cls = ShareGPTDataset
+        if args.backend == "vllm-chat":
+            sample_kwargs["enable_multimodal_chat"] = True
+    elif args.dataset_name == "sonnet":
+        assert tokenizer.chat_template or tokenizer.default_chat_template, (
+            "Tokenizer/model must have chat template for sonnet dataset.")
+        dataset_cls = SonnetDataset
+        sample_kwargs["prefix_len"] = args.prefix_len
+        sample_kwargs["return_prompt_formatted"] = True
+    elif args.dataset_name == "burstgpt":
+        dataset_cls = BurstGPTDataset
+    elif args.dataset_name == "hf":
+        if args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS:
+            dataset_cls = VisionArenaDataset
+            common_kwargs['dataset_subset'] = None
+            common_kwargs['dataset_split'] = "train"
+            sample_kwargs["enable_multimodal_chat"] = True
+        elif args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS:
+            dataset_cls = InstructCoderDataset
+            common_kwargs['dataset_split'] = "train"
+        elif args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS:
+            dataset_cls = ConversationDataset
+            common_kwargs['dataset_subset'] = args.hf_subset
+            common_kwargs['dataset_split'] = args.hf_split
+            sample_kwargs["enable_multimodal_chat"] = True
+
+    else:
+        raise ValueError(f"Unknown dataset name: {args.dataset_name}")
+    # Remove None values
+    sample_kwargs = {k: v for k, v in sample_kwargs.items() if v is not None}
+    return dataset_cls(**common_kwargs).sample(**sample_kwargs)

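A small sketch (not part of the diff) of the argparse.Namespace fields get_requests() reads on the default random-dataset path; the values are arbitrary.

    # Sketch only: minimal namespace for the random-dataset branch of get_requests().
    import argparse

    demo_args = argparse.Namespace(
        dataset_path=None, dataset_name="random", seed=0, backend="vllm",
        lora_path=None, max_loras=None, num_prompts=8, input_len=128,
        output_len=32, random_range_ratio=None, prefix_len=None,
        hf_subset=None, hf_split=None)
    # requests = get_requests(demo_args, tokenizer)  # tokenizer: AutoTokenizer instance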
def main(args: argparse.Namespace):
+    if args.seed is None:
+        args.seed = 0
    print(args)
    random.seed(args.seed)

    # Sample the requests.
    tokenizer = AutoTokenizer.from_pretrained(
        args.tokenizer, trust_remote_code=args.trust_remote_code)
-    if args.dataset is None:
-        vocab_size = tokenizer.vocab_size
-        requests = []
-        for _ in range(args.num_prompts):
-
-            request_tokenizer = tokenizer
-            lora_request: Optional[LoRARequest] = None
-            if args.enable_lora:
-                lora_request, lora_tokenizer = get_random_lora_request(args)
-                if lora_tokenizer:
-                    request_tokenizer = lora_tokenizer
-
-            # Synthesize a prompt with the given input length.
-            candidate_ids = [
-                random.randint(0, vocab_size - 1)
-                for _ in range(args.input_len)
-            ]
-            # As tokenizer may add additional tokens like BOS, we need to try
-            # different lengths to get the desired input length.
-            for _ in range(5):  # Max attempts to correct
-                candidate_prompt = request_tokenizer.decode(candidate_ids)
-                tokenized_len = len(request_tokenizer.encode(candidate_prompt))
-
-                if tokenized_len == args.input_len:
-                    break
-
-                # Adjust length based on difference
-                diff = args.input_len - tokenized_len
-                if diff > 0:
-                    candidate_ids.extend([
-                        random.randint(100, vocab_size - 100)
-                        for _ in range(diff)
-                    ])
-                else:
-                    candidate_ids = candidate_ids[:diff]
-            requests.append(
-                SampleRequest(prompt=candidate_prompt,
-                              prompt_len=args.input_len,
-                              expected_output_len=args.output_len,
-                              lora_request=lora_request))
-    else:
-        requests = sample_requests(tokenizer, args)
+    requests = get_requests(args, tokenizer)

    is_multi_modal = any(request.multi_modal_data is not None
                         for request in requests)
+    request_outputs: Optional[list[RequestOutput]] = None
    if args.backend == "vllm":
        if args.async_engine:
            elapsed_time = uvloop.run(
@@ -397,31 +360,59 @@ def main(args: argparse.Namespace):
                    args.n,
                    AsyncEngineArgs.from_cli_args(args),
                    args.disable_frontend_multiprocessing,
+                    args.disable_detokenize,
                ))
        else:
-            elapsed_time = run_vllm(requests, args.n,
-                                    EngineArgs.from_cli_args(args))
+            elapsed_time, request_outputs = run_vllm(
+                requests, args.n, EngineArgs.from_cli_args(args),
+                args.disable_detokenize)
    elif args.backend == "hf":
        assert args.tensor_parallel_size == 1
        elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
-                              args.hf_max_batch_size, args.trust_remote_code)
+                              args.hf_max_batch_size, args.trust_remote_code,
+                              args.disable_detokenize)
    elif args.backend == "mii":
        elapsed_time = run_mii(requests, args.model, args.tensor_parallel_size,
                               args.output_len)
+    elif args.backend == "vllm-chat":
+        elapsed_time, request_outputs = run_vllm_chat(
+            requests, args.n, EngineArgs.from_cli_args(args),
+            args.disable_detokenize)
    else:
        raise ValueError(f"Unknown backend: {args.backend}")
-    total_num_tokens = sum(request.prompt_len + request.expected_output_len
-                           for request in requests)
-    total_output_tokens = sum(request.expected_output_len
-                              for request in requests)
-    if is_multi_modal:
-        print("\033[91mWARNING\033[0m: Multi-modal request detected. The "
+    if request_outputs:
+        # Note: with the vllm and vllm-chat backends,
+        # we have request_outputs, which we use to count tokens.
+        total_prompt_tokens = 0
+        total_output_tokens = 0
+        for ro in request_outputs:
+            if not isinstance(ro, RequestOutput):
+                continue
+            total_prompt_tokens += len(
+                ro.prompt_token_ids) if ro.prompt_token_ids else 0
+            total_output_tokens += sum(
+                len(o.token_ids) for o in ro.outputs if o)
+        total_num_tokens = total_prompt_tokens + total_output_tokens
+    else:
+        total_num_tokens = sum(r.prompt_len + r.expected_output_len
+                               for r in requests)
+        total_output_tokens = sum(r.expected_output_len for r in requests)
+        total_prompt_tokens = total_num_tokens - total_output_tokens
+
+    if is_multi_modal and args.backend != "vllm-chat":
+        print("\033[91mWARNING\033[0m: Multi-modal request with "
+              f"{args.backend} backend detected. The "
              "following metrics are not accurate because image tokens are not"
              " counted. See vllm-project/vllm/issues/9778 for details.")
-        # TODO(vllm-project/vllm/issues/9778): Count molti-modal token length.
+        # TODO(vllm-project/vllm/issues/9778): Count multi-modal token length.
+        # vllm-chat backend counts the image tokens now

    print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
          f"{total_num_tokens / elapsed_time:.2f} total tokens/s, "
          f"{total_output_tokens / elapsed_time:.2f} output tokens/s")
+    print(f"Total num prompt tokens: {total_prompt_tokens}")
+    print(f"Total num output tokens: {total_output_tokens}")

    # Output JSON results if specified
    if args.output_json:
@@ -434,20 +425,122 @@ def main(args: argparse.Namespace):
        }
        with open(args.output_json, "w") as f:
            json.dump(results, f, indent=4)
+        save_to_pytorch_benchmark_format(args, results)

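For reference, a hypothetical results payload (values invented) carrying the keys that save_to_pytorch_benchmark_format() above expects to find:

    # Hypothetical example only; the keys mirror what
    # save_to_pytorch_benchmark_format() reads, the values are made up.
    results = {
        "elapsed_time": 8.1,          # seconds
        "num_requests": 100,
        "total_num_tokens": 51200,
        "requests_per_second": 12.3,
        "tokens_per_second": 6320.9,
    }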
+def validate_args(args):
+    """
+    Validate command-line arguments.
+    """
+
+    # === Deprecation and Defaulting ===
+    if args.dataset is not None:
+        warnings.warn(
+            "The '--dataset' argument will be deprecated in the next release. "
+            "Please use '--dataset-name' and '--dataset-path' instead.",
+            stacklevel=2)
+        args.dataset_path = args.dataset
+
+    if not getattr(args, "tokenizer", None):
+        args.tokenizer = args.model
+
+    # === Backend Validation ===
+    valid_backends = {"vllm", "hf", "mii", "vllm-chat"}
+    if args.backend not in valid_backends:
+        raise ValueError(f"Unsupported backend: {args.backend}")
+
+    # === Dataset Configuration ===
+    if not args.dataset and not args.dataset_path:
+        print(
+            "When dataset path is not set, it will default to random dataset")
+        args.dataset_name = 'random'
+        if args.input_len is None:
+            raise ValueError("input_len must be provided for a random dataset")
+
+    # === Dataset Name Specific Checks ===
+    # --hf-subset and --hf-split: only used
+    # when dataset_name is 'hf'
+    if args.dataset_name != "hf" and (
+            getattr(args, "hf_subset", None) is not None
+            or getattr(args, "hf_split", None) is not None):
+        warnings.warn("--hf-subset and --hf-split will be ignored \
+                since --dataset-name is not 'hf'.",
+                      stacklevel=2)
+    elif args.dataset_name == "hf":
+        if args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS:
+            assert args.backend == "vllm-chat", "VisionArenaDataset needs to use vllm-chat as the backend."  #noqa: E501
+        elif args.dataset_path in InstructCoderDataset.SUPPORTED_DATASET_PATHS:
+            assert args.backend == "vllm", "InstructCoder dataset needs to use vllm as the backend."  #noqa: E501
+        elif args.dataset_path in ConversationDataset.SUPPORTED_DATASET_PATHS:
+            assert args.backend == "vllm-chat", "ConversationDataset needs to use vllm-chat as the backend."  #noqa: E501
+        else:
+            raise ValueError(
+                f"{args.dataset_path} is not supported by hf dataset.")
+
+    # --random-range-ratio: only used when dataset_name is 'random'
+    if args.dataset_name != 'random' and args.random_range_ratio is not None:
+        warnings.warn("--random-range-ratio will be ignored since \
+                --dataset-name is not 'random'.",
+                      stacklevel=2)
+
+    # --prefix-len: only used when dataset_name is 'random', 'sonnet', or not
+    # set.
+    if args.dataset_name not in {"random", "sonnet", None
+                                 } and args.prefix_len is not None:
+        warnings.warn("--prefix-len will be ignored since --dataset-name\
+                 is not 'random', 'sonnet', or not set.",
+                      stacklevel=2)
+
+    # === LoRA Settings ===
+    if getattr(args, "enable_lora", False) and args.backend != "vllm":
+        raise ValueError(
+            "LoRA benchmarking is only supported for vLLM backend")
+    if getattr(args, "enable_lora", False) and args.lora_path is None:
+        raise ValueError("LoRA path must be provided when enable_lora is True")
+
+    # === Backend-specific Validations ===
+    if args.backend == "hf" and args.hf_max_batch_size is None:
+        raise ValueError("HF max batch size is required for HF backend")
+    if args.backend != "hf" and args.hf_max_batch_size is not None:
+        raise ValueError("HF max batch size is only for HF backend.")
+
+    if args.backend in {"hf", "mii"} and getattr(args, "quantization",
+                                                 None) is not None:
+        raise ValueError("Quantization is only for vLLM backend.")
+
+    if args.backend == "mii" and args.dtype != "auto":
+        raise ValueError("dtype must be auto for MII backend.")
+    if args.backend == "mii" and args.n != 1:
+        raise ValueError("n must be 1 for MII backend.")
+    if args.backend == "mii" and args.tokenizer != args.model:
+        raise ValueError(
+            "Tokenizer must be the same as the model for MII backend.")

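A quick sketch (not part of the diff) of validate_args() catching a combination the old inline checks also rejected; the namespace fields are assumed to match the parser defined below and the model id is a placeholder.

    # Sketch only: an HF-backend namespace without --hf-max-batch-size.
    import argparse

    bad_args = argparse.Namespace(
        dataset=None, dataset_path=None, dataset_name="random", tokenizer=None,
        model="facebook/opt-125m",  # assumed model id
        backend="hf", hf_max_batch_size=None, hf_subset=None, hf_split=None,
        random_range_ratio=None, prefix_len=None, input_len=128, output_len=32,
        enable_lora=False, lora_path=None, quantization=None, dtype="auto", n=1)
    # validate_args(bad_args)  # raises ValueError: HF max batch size is required for HF backend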
if __name__ == "__main__":
    parser = FlexibleArgumentParser(description="Benchmark the throughput.")
    parser.add_argument("--backend",
                        type=str,
-                        choices=["vllm", "hf", "mii"],
+                        choices=["vllm", "hf", "mii", "vllm-chat"],
                        default="vllm")
-    parser.add_argument("--dataset",
+    parser.add_argument(
+        "--dataset-name",
+        type=str,
+        choices=["sharegpt", "random", "sonnet", "burstgpt", "hf"],
+        help="Name of the dataset to benchmark on.",
+        default="sharegpt")
+    parser.add_argument(
+        "--dataset",
+        type=str,
+        default=None,
+        help="Path to the ShareGPT dataset, will be deprecated in\
+            the next release. The dataset is expected to "
+        "be a json in form of list[dict[..., conversations: "
+        "list[dict[..., value: <prompt_or_response>]]]]")
+    parser.add_argument("--dataset-path",
                        type=str,
                        default=None,
-                        help="Path to the dataset. The dataset is expected to "
-                        "be a json in form of List[Dict[..., conversations: "
-                        "List[Dict[..., value: <prompt_or_response>]]]]")
+                        help="Path to the dataset")
    parser.add_argument("--input-len",
                        type=int,
                        default=None,
@@ -482,6 +575,11 @@ if __name__ == "__main__":
                        action='store_true',
                        default=False,
                        help="Disable decoupled async engine frontend.")
+    parser.add_argument(
+        "--disable-detokenize",
+        action="store_true",
+        help=("Do not detokenize the response (i.e. do not include "
+              "detokenization time in the measurement)"))
    # LoRA
    parser.add_argument(
        "--lora-path",
@@ -489,43 +587,33 @@ if __name__ == "__main__":
        default=None,
        help="Path to the lora adapters to use. This can be an absolute path, "
        "a relative path, or a Hugging Face model identifier.")
+    parser.add_argument("--prefix-len",
+                        type=int,
+                        default=None,
+                        help="Number of prefix tokens per request."
+                        "This is for the RandomDataset and SonnetDataset")
+    # random dataset
+    parser.add_argument(
+        "--random-range-ratio",
+        type=float,
+        default=None,
+        help="Range of sampled ratio of input/output length, "
+        "used only for RandomDataSet.",
+    )
+
+    # hf dtaset
+    parser.add_argument("--hf-subset",
+                        type=str,
+                        default=None,
+                        help="Subset of the HF dataset.")
+    parser.add_argument("--hf-split",
+                        type=str,
+                        default=None,
+                        help="Split of the HF dataset.")

    parser = AsyncEngineArgs.add_cli_args(parser)
    args = parser.parse_args()
    if args.tokenizer is None:
        args.tokenizer = args.model
-    if args.dataset is None:
-        assert args.input_len is not None
-        assert args.output_len is not None
-    else:
-        assert args.input_len is None
-    if args.enable_lora:
-        assert args.lora_path is not None
-
-    if args.backend == "vllm":
-        if args.hf_max_batch_size is not None:
-            raise ValueError("HF max batch size is only for HF backend.")
-    elif args.backend == "hf":
-        if args.hf_max_batch_size is None:
-            raise ValueError("HF max batch size is required for HF backend.")
-        if args.quantization is not None:
-            raise ValueError("Quantization is only for vLLM backend.")
-        if args.enable_lora is not None:
-            raise ValueError("LoRA benchmarking is only supported for vLLM"
-                             " backend")
-    elif args.backend == "mii":
-        if args.dtype != "auto":
-            raise ValueError("dtype must be auto for MII backend.")
-        if args.n != 1:
-            raise ValueError("n must be 1 for MII backend.")
-        if args.quantization is not None:
-            raise ValueError("Quantization is only for vLLM backend.")
-        if args.hf_max_batch_size is not None:
-            raise ValueError("HF max batch size is only for HF backend.")
-        if args.tokenizer != args.model:
-            raise ValueError("Tokenizer must be the same as the model for MII "
-                             "backend.")
-        if args.enable_lora is not None:
-            raise ValueError("LoRA benchmarking is only supported for vLLM"
-                             " backend")
+    validate_args(args)
    main(args)

benchmarks/benchmark_utils.py (new file, 69 lines)
@@ -0,0 +1,69 @@
# SPDX-License-Identifier: Apache-2.0

import argparse
import json
import math
import os
from typing import Any


def convert_to_pytorch_benchmark_format(args: argparse.Namespace,
                                        metrics: dict[str, list],
                                        extra_info: dict[str, Any]) -> list:
    """
    Save the benchmark results in the format used by PyTorch OSS benchmark with
    on metric per record
    https://github.com/pytorch/pytorch/wiki/How-to-integrate-with-PyTorch-OSS-benchmark-database
    """
    records = []
    if not os.environ.get("SAVE_TO_PYTORCH_BENCHMARK_FORMAT", False):
        return records

    for name, benchmark_values in metrics.items():
        record = {
            "benchmark": {
                "name": "vLLM benchmark",
                "extra_info": {
                    "args": vars(args),
                },
            },
            "model": {
                "name": args.model,
            },
            "metric": {
                "name": name,
                "benchmark_values": benchmark_values,
                "extra_info": extra_info,
            },
        }

        tp = record["benchmark"]["extra_info"]["args"].get(
            "tensor_parallel_size")
        # Save tensor_parallel_size parameter if it's part of the metadata
        if not tp and "tensor_parallel_size" in extra_info:
            record["benchmark"]["extra_info"]["args"][
                "tensor_parallel_size"] = extra_info["tensor_parallel_size"]

        records.append(record)

    return records


class InfEncoder(json.JSONEncoder):

    def clear_inf(self, o: Any):
        if isinstance(o, dict):
            return {k: self.clear_inf(v) for k, v in o.items()}
        elif isinstance(o, list):
            return [self.clear_inf(v) for v in o]
        elif isinstance(o, float) and math.isinf(o):
            return "inf"
        return o

    def iterencode(self, o: Any, *args, **kwargs) -> Any:
        return super().iterencode(self.clear_inf(o), *args, **kwargs)


def write_to_json(filename: str, records: list) -> None:
    with open(filename, "w") as f:
        json.dump(records, f, cls=InfEncoder)
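A short usage sketch (not part of the diff) for the helpers in this new file; the metric names, values, and model id below are arbitrary examples.

    # Sketch only: records are emitted only when the env var below is set.
    import argparse
    import os

    os.environ["SAVE_TO_PYTORCH_BENCHMARK_FORMAT"] = "1"
    demo_args = argparse.Namespace(model="facebook/opt-125m",  # assumed model id
                                   tensor_parallel_size=1)
    records = convert_to_pytorch_benchmark_format(
        args=demo_args,
        metrics={"requests_per_second": [12.3]},
        extra_info={"elapsed_time": 8.1, "num_requests": 100})
    write_to_json("throughput.pytorch.json", records)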
@@ -1,9 +1,12 @@
+# SPDX-License-Identifier: Apache-2.0
+
import argparse
import copy
import itertools
import pickle as pkl
import time
-from typing import Callable, Iterable, List, Tuple
+from collections.abc import Iterable
+from typing import Callable

import torch
import torch.utils.benchmark as TBenchmark
@@ -226,7 +229,7 @@ def print_timers(timers: Iterable[TMeasurement]):


def run(dtype: torch.dtype,
-        MKNs: Iterable[Tuple[int, int, int]]) -> Iterable[TMeasurement]:
+        MKNs: Iterable[tuple[int, int, int]]) -> Iterable[TMeasurement]:
    results = []
    for m, k, n in MKNs:
        timers = bench(dtype, m, k, n, f"scaled-{dtype}-gemm",
@@ -239,7 +242,7 @@ def run(dtype: torch.dtype,

# output makers
def make_output(data: Iterable[TMeasurement],
-                MKNs: Iterable[Tuple[int, int, int]],
+                MKNs: Iterable[tuple[int, int, int]],
                base_description: str,
                timestamp=None):
    print(f"== All Results {base_description} ====")
@@ -280,7 +283,7 @@ def run_model_bench(args):
    for i, model in enumerate(args.models):
        print(f"[{i}] {model}")

-    def model_shapes(model_name: str, tp_size: int) -> List[Tuple[int, int]]:
+    def model_shapes(model_name: str, tp_size: int) -> list[tuple[int, int]]:
        KNs = []
        for KN, tp_split_dim in copy.deepcopy(WEIGHT_SHAPES[model_name]):
            KN[tp_split_dim] = KN[tp_split_dim] // tp_size
@@ -1,5 +1,7 @@
+# SPDX-License-Identifier: Apache-2.0
+
# Cutlass bench utils
-from typing import Iterable, Tuple
+from collections.abc import Iterable

import torch

@@ -25,7 +27,7 @@ def to_fp16(tensor: torch.Tensor) -> torch.Tensor:


def make_rand_tensors(dtype: torch.dtype, m: int, n: int,
-                      k: int) -> Tuple[torch.Tensor, torch.Tensor]:
+                      k: int) -> tuple[torch.Tensor, torch.Tensor]:
    a = torch.randn((m, k), device='cuda') * 5
    b = torch.randn((n, k), device='cuda').t() * 5

@@ -61,7 +63,7 @@ def prune_to_2_4(tensor):


def make_rand_sparse_tensors(dtype: torch.dtype, m: int, n: int,
-                             k: int) -> Tuple[torch.Tensor, torch.Tensor]:
+                             k: int) -> tuple[torch.Tensor, torch.Tensor]:
    a = torch.randn((m, k), device='cuda') * 5
    b = torch.randn((n, k), device='cuda').t() * 5

@@ -86,7 +88,7 @@ def make_rand_sparse_tensors(dtype: torch.dtype, m: int, n: int,

def make_n_rand_sparse_tensors(num_tensors: int, dtype: torch.dtype,
                               m: int, n: int, k: int) -> \
-                               Tuple[Iterable[torch.Tensor], Iterable[torch.Tensor]]:
+                               tuple[Iterable[torch.Tensor], Iterable[torch.Tensor]]:
    ABs = []
    for _ in range(num_tensors):
        b_comp, e, a, b = make_rand_sparse_tensors(dtype, m, n, k)
@@ -1,9 +1,12 @@
+# SPDX-License-Identifier: Apache-2.0
+
import argparse
import copy
import itertools
import pickle as pkl
import time
-from typing import Callable, Iterable, List, Tuple
+from collections.abc import Iterable
+from typing import Callable, Optional

import torch
import torch.utils.benchmark as TBenchmark
@@ -12,6 +15,8 @@ from utils import make_rand_tensors
from weight_shapes import WEIGHT_SHAPES

from vllm import _custom_ops as ops
+from vllm.model_executor.layers.quantization.utils.fp8_utils import (
+    w8a8_block_fp8_matmul)
from vllm.utils import FlexibleArgumentParser

DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())
@@ -38,8 +43,15 @@ def bench_fn(label: str, sub_label: str, description: str, fn: Callable, *args,
    ).blocked_autorange(min_run_time=min_run_time)

-def bench_int8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
-               sub_label: str) -> Iterable[TMeasurement]:
+def bench_int8(
+        dtype: torch.dtype,
+        m: int,
+        k: int,
+        n: int,
+        label: str,
+        sub_label: str,
+        bench_kernels: Optional[list[str]] = None) -> Iterable[TMeasurement]:
+    """Benchmark INT8-based kernels."""
    assert dtype == torch.int8
    a, b = make_rand_tensors(torch.int8, m, n, k)
    scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
@@ -48,155 +60,132 @@ def bench_int8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
    azp = torch.zeros((m, ), device="cuda", dtype=torch.int32)
    azp_adj = torch.zeros((n, ), device="cuda", dtype=torch.int32)

+    bench_fns = {
+        "pytorch_bf16_bf16_bf16_matmul-no-scales":
+        lambda: torch.mm(a.to(dtype=torch.bfloat16), b.to(dtype=torch.bfloat16)
+                         ),
+        "pytorch_fp16_fp16_fp16_matmul-no-scales":
+        lambda: torch.mm(a.to(dtype=torch.float16), b.to(dtype=torch.float16)),
+        "cutlass_i8_i8_bf16_scaled_mm":
+        lambda: ops.cutlass_scaled_mm(a, b, scale_a, scale_b, torch.bfloat16),
+        "cutlass_i8_i8_bf16_scaled_mm_bias":
+        lambda: ops.cutlass_scaled_mm(a, b, scale_a, scale_b, torch.bfloat16,
+                                      bias),
+        "cutlass_i8_i8_bf16_scaled_mm_azp":
+        lambda: ops.cutlass_scaled_mm_azp(a, b, scale_a, scale_b, torch.
+                                          bfloat16, azp_adj),
+        "cutlass_i8_i8_bf16_scaled_mm_azp_bias":
+        lambda: ops.cutlass_scaled_mm_azp(a, b, scale_a, scale_b, torch.
+                                          bfloat16, azp_adj, None, bias),
+        "cutlass_i8_i8_bf16_scaled_mm_azp_pt":
+        lambda: ops.cutlass_scaled_mm_azp(a, b, scale_a, scale_b, torch.
+                                          bfloat16, azp_adj, azp),
+        "cutlass_i8_i8_bf16_scaled_mm_azp_pt_bias":
+        lambda: ops.cutlass_scaled_mm_azp(a, b, scale_a, scale_b, torch.
+                                          bfloat16, azp_adj, azp, bias),
+    }

    timers = []
-    # pytorch impl - bfloat16
-    timers.append(
-        bench_fn(label, sub_label, "pytorch_bf16_bf16_bf16_matmul-no-scales",
-                 torch.mm, a.to(dtype=torch.bfloat16),
-                 b.to(dtype=torch.bfloat16)))
-
-    # pytorch impl - float16
-    timers.append(
-        bench_fn(label, sub_label,
-                 "pytorch_fp16_fp16_fp16_matmul-no-scales", torch.mm,
-                 a.to(dtype=torch.float16), b.to(dtype=torch.float16)))
-
-    # cutlass impl
-    timers.append(
-        bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm",
-                 ops.cutlass_scaled_mm, a, b, scale_a, scale_b,
-                 torch.bfloat16))
-
-    # cutlass with bias
-    timers.append(
-        bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_bias",
-                 ops.cutlass_scaled_mm, a, b, scale_a, scale_b, torch.bfloat16,
-                 bias))
-
-    # cutlass with azp per-tensor
-    timers.append(
-        bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_azp",
-                 ops.cutlass_scaled_mm_azp, a, b, scale_a, scale_b,
-                 torch.bfloat16, azp_adj))
-
-    # cutlass with azp per-tensor + bias
-    timers.append(
-        bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_azp_bias",
-                 ops.cutlass_scaled_mm_azp, a, b, scale_a, scale_b,
-                 torch.bfloat16, azp_adj, None, bias))
-
-    # cutlass with azp per-token
-    timers.append(
-        bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_azp_pt",
-                 ops.cutlass_scaled_mm_azp, a, b, scale_a, scale_b,
-                 torch.bfloat16, azp_adj, azp))
-
-    # cutlass with azp per-token + bias
-    timers.append(
-        bench_fn(label, sub_label, "cutlass_i8_i8_bf16_scaled_mm_azp_pt_bias",
-                 ops.cutlass_scaled_mm_azp, a, b, scale_a, scale_b,
-                 torch.bfloat16, azp_adj, azp, bias))
-
+    for name, fn in bench_fns.items():
+        # If bench_kernels is None, run all. Otherwise, run only exact matches.
+        if bench_kernels is None or name in bench_kernels:
+            print(f"Running {name}")
+            timers.append(bench_fn(label, sub_label, name, fn))

    return timers

-def bench_fp8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
-              sub_label: str) -> Iterable[TMeasurement]:
+def bench_fp8(
+        dtype: torch.dtype,
+        m: int,
+        k: int,
+        n: int,
+        label: str,
+        sub_label: str,
+        bench_kernels: Optional[list[str]] = None) -> Iterable[TMeasurement]:
+    """Benchmark FP8-based kernels."""
    assert dtype == torch.float8_e4m3fn
    a, b = make_rand_tensors(torch.float8_e4m3fn, m, n, k)
+    a_cont = a.contiguous()
    scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
    scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)
+    block_scale_a = torch.rand((m, k // 128),
+                               device="cuda",
+                               dtype=torch.float32)
+    block_scale_b = torch.rand((k // 128, n // 128),
+                               device="cuda",
+                               dtype=torch.float32)
+    block_scale_a_M_major = block_scale_a.t().contiguous().t()
+    block_scale_b_K_major = block_scale_b.t().contiguous().t()
    bias = torch.zeros((n, ), device="cuda", dtype=torch.bfloat16)

+    print(m, k, n)
+
+    bench_fns = {
+        "pytorch_bf16_bf16_bf16_matmul-no-scales":
+        lambda: torch.mm(a.to(dtype=torch.bfloat16), b.to(dtype=torch.bfloat16)
+                         ),
+        "pytorch_fp16_fp16_fp16_matmul-no-scales":
+        lambda: torch.mm(a.to(dtype=torch.float16), b.to(dtype=torch.float16)),
+        "pytorch_fp8_fp8_fp16_scaled_mm":
+        lambda: torch._scaled_mm(
+            a, b, scale_a, scale_b, out_dtype=torch.float16),
+        "pytorch_fp8_fp8_fp16_scaled_mm_fast_accum":
+        lambda: torch._scaled_mm(a,
+                                 b,
+                                 scale_a,
+                                 scale_b,
+                                 out_dtype=torch.float16,
+                                 use_fast_accum=True),
+        "pytorch_fp8_fp8_bf16_scaled_mm":
+        lambda: torch._scaled_mm(
+            a, b, scale_a, scale_b, out_dtype=torch.bfloat16),
+        "pytorch_fp8_fp8_bf16_scaled_mm_fast_accum":
+        lambda: torch._scaled_mm(a,
+                                 b,
+                                 scale_a,
+                                 scale_b,
+                                 out_dtype=torch.bfloat16,
+                                 use_fast_accum=True),
+        "cutlass_fp8_fp8_bf16_scaled_mm":
+        lambda: ops.cutlass_scaled_mm(a, b, scale_a, scale_b, torch.bfloat16),
+        "cutlass_fp8_fp8_fp16_scaled_mm":
+        lambda: ops.cutlass_scaled_mm(a, b, scale_a, scale_b, torch.float16),
+        "cutlass_fp8_fp8_bf16_scaled_mm_bias":
+        lambda: ops.cutlass_scaled_mm(a, b, scale_a, scale_b, torch.bfloat16,
+                                      bias),
+        "cutlass_fp8_fp8_fp16_scaled_mm_bias":
+        lambda: ops.cutlass_scaled_mm(a, b, scale_a, scale_b, torch.float16,
+                                      bias.to(dtype=torch.float16)),
+        "triton_fp8_fp8_fp16_scaled_mm_blockwise":
+        lambda: w8a8_block_fp8_matmul(a_cont, b.t(), block_scale_a,
+                                      block_scale_b.t(), (128, 128)),
+        "cutlass_fp8_fp8_fp16_scaled_mm_blockwise":
+        lambda: ops.cutlass_scaled_mm(a, b, block_scale_a_M_major,
+                                      block_scale_b_K_major, torch.float16),
+    }

    timers = []
-    # pytorch impl w. bf16
-    timers.append(
-        bench_fn(label, sub_label, "pytorch_bf16_bf16_bf16_matmul-no-scales",
-                 torch.mm, a.to(dtype=torch.bfloat16, device="cuda"),
-                 b.to(dtype=torch.bfloat16, device="cuda")))
-
-    # pytorch impl: bf16 output, without fp8 fast accum
-    timers.append(
-        bench_fn(label,
-                 sub_label,
-                 "pytorch_fp8_fp8_bf16_scaled_mm",
-                 torch._scaled_mm,
-                 a,
-                 b,
-                 scale_a=scale_a,
-                 scale_b=scale_b,
-                 out_dtype=torch.bfloat16))
-
-    # pytorch impl: bf16 output, with fp8 fast accum
-    timers.append(
-        bench_fn(label,
-                 sub_label,
-                 "pytorch_fp8_fp8_bf16_scaled_mm_fast_accum",
-                 torch._scaled_mm,
-                 a,
-                 b,
-                 scale_a=scale_a,
-                 scale_b=scale_b,
-                 out_dtype=torch.bfloat16,
-                 use_fast_accum=True))
-
-    # pytorch impl: fp16 output, without fp8 fast accum
-    timers.append(
-        bench_fn(label,
-                 sub_label,
-                 "pytorch_fp8_fp8_fp16_scaled_mm",
-                 torch._scaled_mm,
-                 a,
-                 b,
-                 scale_a=scale_a,
-                 scale_b=scale_b,
-                 out_dtype=torch.float16))
-
-    # pytorch impl: fp16 output, with fp8 fast accum
-    timers.append(
-        bench_fn(label,
-                 sub_label,
-                 "pytorch_fp8_fp8_fp16_scaled_mm_fast_accum",
-                 torch._scaled_mm,
-                 a,
-                 b,
-                 scale_a=scale_a,
-                 scale_b=scale_b,
-                 out_dtype=torch.float16,
-                 use_fast_accum=True))
-
-    # cutlass impl: bf16 output
-    timers.append(
-        bench_fn(label, sub_label, "cutlass_fp8_fp8_bf16_scaled_mm",
-                 ops.cutlass_scaled_mm, a, b, scale_a, scale_b,
-                 torch.bfloat16))
-    # cutlass impl: fp16 output
-    timers.append(
-        bench_fn(label, sub_label, "cutlass_fp8_fp8_fp16_scaled_mm",
-                 ops.cutlass_scaled_mm, a, b, scale_a, scale_b, torch.float16))
-
-    # cutlass impl: bf16 output, with bias
-    timers.append(
-        bench_fn(label, sub_label, "cutlass_fp8_fp8_bf16_scaled_mm_bias",
-                 ops.cutlass_scaled_mm, a, b, scale_a, scale_b, torch.bfloat16,
-                 bias))
-
-    # cutlass impl: fp16 output, with bias
-    timers.append(
-        bench_fn(label, sub_label, "cutlass_fp8_fp8_fp16_scaled_mm_bias",
-                 ops.cutlass_scaled_mm, a, b, scale_a, scale_b, torch.float16,
-                 bias.to(dtype=torch.float16)))
-
+    for name, fn in bench_fns.items():
+        # If bench_kernels is None, run all. Otherwise, run only exact matches.
+        if bench_kernels is None or name in bench_kernels:
+            print(f"Running {name}")
+            timers.append(bench_fn(label, sub_label, name, fn))

    return timers

-def bench(dtype: torch.dtype, m: int, k: int, n: int, label: str,
-          sub_label: str) -> Iterable[TMeasurement]:
+def bench(dtype: torch.dtype,
+          m: int,
+          k: int,
+          n: int,
+          label: str,
+          sub_label: str,
+          bench_kernels: Optional[list[str]] = None) -> Iterable[TMeasurement]:
    if dtype == torch.int8:
-        return bench_int8(dtype, m, k, n, label, sub_label)
+        return bench_int8(dtype, m, k, n, label, sub_label, bench_kernels)
    if dtype == torch.float8_e4m3fn:
-        return bench_fp8(dtype, m, k, n, label, sub_label)
+        return bench_fp8(dtype, m, k, n, label, sub_label, bench_kernels)
    raise ValueError("unsupported type")

@@ -207,20 +196,24 @@ def print_timers(timers: Iterable[TMeasurement]):


def run(dtype: torch.dtype,
-        MKNs: Iterable[Tuple[int, int, int]]) -> Iterable[TMeasurement]:
+        MKNs: Iterable[tuple[int, int, int]],
+        bench_kernels: Optional[list[str]] = None) -> Iterable[TMeasurement]:
    results = []
    for m, k, n in MKNs:
-        timers = bench(dtype, m, k, n, f"scaled-{dtype}-gemm",
-                       f"MKN=({m}x{k}x{n})")
+        timers = bench(dtype,
+                       m,
+                       k,
+                       n,
+                       f"scaled-{dtype}-gemm",
+                       f"MKN=({m}x{k}x{n})",
+                       bench_kernels=bench_kernels)
        print_timers(timers)
        results.extend(timers)

    return results


-# output makers
def make_output(data: Iterable[TMeasurement],
-                MKNs: Iterable[Tuple[int, int, int]],
+                MKNs: Iterable[tuple[int, int, int]],
                base_description: str,
                timestamp=None):
    print(f"== All Results {base_description} ====")
@@ -232,15 +225,11 @@ def make_output(data: Iterable[TMeasurement],
        pkl.dump(data, f)


-# argparse runners
-
-
def run_square_bench(args):
    dim_sizes = list(
        range(args.dim_start, args.dim_end + 1, args.dim_increment))
    MKNs = list(zip(dim_sizes, dim_sizes, dim_sizes))
-    data = run(args.dtype, MKNs)
+    data = run(args.dtype, MKNs, bench_kernels=args.kernels)

    make_output(data, MKNs, f"square_bench-{args.dtype}")


@@ -251,8 +240,7 @@ def run_range_bench(args):
    Ks = [args.k_constant] * n if args.k_constant is not None else dim_sizes
    Ns = [args.n_constant] * n if args.n_constant is not None else dim_sizes
    MKNs = list(zip(Ms, Ks, Ns))
-    data = run(args.dtype, MKNs)
-
+    data = run(args.dtype, MKNs, bench_kernels=args.kernels)

    make_output(data, MKNs, f"range_bench-{args.dtype}")


@@ -261,7 +249,7 @@ def run_model_bench(args):
    for i, model in enumerate(args.models):
        print(f"[{i}] {model}")

-    def model_shapes(model_name: str, tp_size: int) -> List[Tuple[int, int]]:
+    def model_shapes(model_name: str, tp_size: int) -> list[tuple[int, int]]:
        KNs = []
        for KN, tp_split_dim in copy.deepcopy(WEIGHT_SHAPES[model_name]):
            KN[tp_split_dim] = KN[tp_split_dim] // tp_size
@@ -278,7 +266,7 @@ def run_model_bench(args):
        for k, n in KNs:
            MKNs.append((m, k, n))

-        data = run(args.dtype, MKNs)
+        data = run(args.dtype, MKNs, bench_kernels=args.kernels)
        model_bench_data.append(data)

    # Print all results
@@ -328,6 +316,15 @@ Benchmark Cutlass GEMM.
        type=to_torch_dtype,
        required=True,
        help="Available options are ['int8', 'fp8']")
+    parser.add_argument(
+        "--kernels",
+        nargs="+",
+        type=str,
+        default=None,
+        help=
+        "Exact names of the kernels to benchmark. If not set, runs all kernels."
+    )

    subparsers = parser.add_subparsers(dest="cmd")

    square_parser = subparsers.add_parser("square_bench")
@@ -362,4 +359,4 @@ Benchmark Cutlass GEMM.
    model_parser.set_defaults(func=run_model_bench)

    args = parser.parse_args()
    args.func(args)

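A standalone sketch (not part of the diff) of the pattern the hunks above converge on: a name-keyed dict of callables timed with torch.utils.benchmark, with an optional exact-name filter like the new --kernels flag. The matmul entries are placeholders, not the CUTLASS/Triton kernels from the benchmark scripts.

    # Sketch only: placeholder kernels; the filter mirrors `--kernels` exact matching.
    from typing import Callable, Optional

    import torch
    import torch.utils.benchmark as TBenchmark


    def run_selected(bench_fns: dict[str, Callable],
                     kernels: Optional[list[str]] = None) -> list:
        timers = []
        for name, fn in bench_fns.items():
            if kernels is None or name in kernels:
                timers.append(
                    TBenchmark.Timer(stmt="fn()",
                                     globals={"fn": fn},
                                     label="demo-gemm",
                                     sub_label="MKN=(64x64x64)",
                                     description=name).blocked_autorange(
                                         min_run_time=1))
        return timers


    a = torch.randn(64, 64)
    b = torch.randn(64, 64)
    fns = {"torch_mm": lambda: torch.mm(a, b),
           "torch_matmul": lambda: torch.matmul(a, b)}
    for t in run_selected(fns, kernels=["torch_mm"]):
        print(t)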
@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
# Weight Shapes are in the format
# ([K, N], TP_SPLIT_DIM)
# Example:

@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
import os

import aiohttp

@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
import asyncio
import itertools

@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
import json

import matplotlib.pyplot as plt

@@ -1,8 +1,11 @@
+# SPDX-License-Identifier: Apache-2.0
+
import pickle as pkl
import time
+from collections.abc import Iterable
from dataclasses import dataclass
from itertools import product
-from typing import Callable, Iterable, List, Optional
+from typing import Callable, Optional

import torch
import torch.utils.benchmark as TBenchmark
@@ -27,7 +30,7 @@ class bench_params_t:
                f'x DT {self.dtype}')


-def get_bench_params() -> List[bench_params_t]:
+def get_bench_params() -> list[bench_params_t]:
    ## Test Fixtures
    NUM_TOKENS = [2**x for x in range(11)]
    HIDDEN_SIZES = list(range(1024, 8129, 1024))

@@ -1,3 +1,5 @@
+# SPDX-License-Identifier: Apache-2.0
+
import os
import sys
from typing import Optional
340
benchmarks/kernels/benchmark_grouped_gemm_cutlass.py
Normal file
340
benchmarks/kernels/benchmark_grouped_gemm_cutlass.py
Normal file
@ -0,0 +1,340 @@
|
|||||||
|
# SPDX-License-Identifier: Apache-2.0
|
||||||
|
|
||||||
|
import torch
|
||||||
|
import torch.utils.benchmark as benchmark
|
||||||
|
from benchmark_shapes import WEIGHT_SHAPES_MOE
|
||||||
|
|
||||||
|
from vllm import _custom_ops as ops
|
||||||
|
from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
|
||||||
|
from vllm.model_executor.layers.fused_moe.fused_moe import (cutlass_moe_fp8,
|
||||||
|
fused_experts,
|
||||||
|
fused_topk)
|
||||||
|
from vllm.utils import FlexibleArgumentParser
|
||||||
|
|
||||||
|
DEFAULT_MODELS = [
|
||||||
|
"nm-testing/Mixtral-8x7B-Instruct-v0.1", "nm-testing/deepseekv2-lite",
|
||||||
|
"ibm-granite/granite-3.0-1b-a400m", "ibm-granite/granite-3.0-3b-a800m"
|
||||||
|
]
|
||||||
|
DEFAULT_BATCH_SIZES = [1, 4, 8, 16, 32, 64, 128, 256, 512]
|
||||||
|
DEFAULT_TP_SIZES = [1]
|
||||||
|
|
||||||
|
PER_ACT_TOKEN_OPTS = [False]
|
||||||
|
PER_OUT_CH_OPTS = [False]
|
||||||
|
|
||||||
|
|
||||||
|
def to_fp8(tensor: torch.Tensor):
|
||||||
|
finfo = torch.finfo(torch.float8_e4m3fn)
|
||||||
|
return torch.round(tensor.clamp(
|
||||||
|
min=finfo.min, max=finfo.max)).to(dtype=torch.float8_e4m3fn)
|
||||||
|
|
||||||
|
|
||||||
|
def bench_run(results: list[benchmark.Measurement], model: str,
|
||||||
|
num_experts: int, topk: int, per_act_token: bool,
|
||||||
|
per_out_ch: bool, mkn: tuple[int, int, int]):
|
||||||
|
label = "Quant Matmul"
|
||||||
|
|
||||||
|
sub_label = (
|
||||||
|
"{}, num_experts={}, topk={}, per_act_token={} per_out_ch={}, "
|
||||||
|
"MKN=({})".format(model, num_experts, topk, per_act_token, per_out_ch,
|
||||||
|
mkn))
|
||||||
|
|
||||||
|
print(f"Testing: {sub_label}")
|
||||||
|
|
||||||
|
(m, k, n) = mkn
|
||||||
|
|
||||||
|
dtype = torch.half
|
||||||
|
|
||||||
|
a = torch.randn((m, k), device="cuda", dtype=dtype) / 10
|
||||||
|
w1 = torch.randn((num_experts, 2 * n, k), device="cuda", dtype=dtype) / 10
|
||||||
|
w2 = torch.randn((num_experts, k, n), device="cuda", dtype=dtype) / 10
|
||||||
|
|
||||||
|
_, a_scale = ops.scaled_fp8_quant(a)
|
||||||
|
|
||||||
|
w1_q = torch.empty((num_experts, 2 * n, k),
|
||||||
|
device="cuda",
|
||||||
|
dtype=torch.float8_e4m3fn)
|
||||||
|
w2_q = torch.empty((num_experts, k, n),
|
||||||
|
device="cuda",
|
||||||
|
dtype=torch.float8_e4m3fn)
|
||||||
|
w1_scale = torch.empty((num_experts, 1, 1),
|
||||||
|
device="cuda",
|
||||||
|
dtype=torch.float32)
|
||||||
|
w2_scale = torch.empty((num_experts, 1, 1),
|
||||||
|
device="cuda",
|
||||||
|
dtype=torch.float32)
|
||||||
|
|
||||||
|
ab_strides1 = torch.full((num_experts, ),
|
||||||
|
k,
|
||||||
|
device="cuda",
|
||||||
|
dtype=torch.int64)
|
||||||
|
c_strides1 = torch.full((num_experts, ),
|
||||||
|
2 * n,
|
||||||
|
device="cuda",
|
||||||
|
dtype=torch.int64)
|
||||||
|
ab_strides2 = torch.full((num_experts, ),
|
||||||
|
n,
|
||||||
|
device="cuda",
|
||||||
|
dtype=torch.int64)
|
||||||
|
c_strides2 = torch.full((num_experts, ),
|
||||||
|
k,
|
||||||
|
device="cuda",
|
||||||
|
dtype=torch.int64)
|
||||||
|
|
||||||
|
for expert in range(num_experts):
|
||||||
|
w1_q[expert], w1_scale[expert] = ops.scaled_fp8_quant(w1[expert])
|
||||||
|
w2_q[expert], w2_scale[expert] = ops.scaled_fp8_quant(w2[expert])
|
||||||
|
w1_q_notransp = w1_q.clone()
|
||||||
|
w2_q_notransp = w2_q.clone()
|
||||||
|
w1_q = w1_q.transpose(1, 2)
|
||||||
|
w2_q = w2_q.transpose(1, 2)
|
||||||
|
|
||||||
|
score = torch.randn((m, num_experts), device="cuda", dtype=dtype)
|
||||||
|
|
||||||
|
topk_weights, topk_ids = fused_topk(a, score, topk, renormalize=False)
|
||||||
|
|
||||||
|
    def run_triton_moe(a: torch.Tensor, w1: torch.Tensor, w2: torch.Tensor,
                       topk_weights: torch.Tensor, topk_ids: torch.Tensor,
                       w1_scale: torch.Tensor, w2_scale: torch.Tensor,
                       a_scale: torch.Tensor, num_repeats: int):
        for _ in range(num_repeats):
            fused_experts(a,
                          w1,
                          w2,
                          topk_weights,
                          topk_ids,
                          use_fp8_w8a8=True,
                          w1_scale=w1_scale,
                          w2_scale=w2_scale,
                          a1_scale=a_scale)

    def run_cutlass_moe(a: torch.Tensor, a_scale: torch.Tensor,
                        w1: torch.Tensor, w2: torch.Tensor,
                        w1_scale: torch.Tensor, w2_scale: torch.Tensor,
                        topk_weights: torch.Tensor, topk_ids: torch.Tensor,
                        ab_strides1: torch.Tensor, c_strides1: torch.Tensor,
                        ab_strides2: torch.Tensor, c_strides2: torch.Tensor,
                        num_repeats: int):
        for _ in range(num_repeats):
            cutlass_moe_fp8(a,
                            w1,
                            w2,
                            w1_scale,
                            w2_scale,
                            topk_weights,
                            topk_ids,
                            ab_strides1,
                            c_strides1,
                            ab_strides2,
                            c_strides2,
                            a1_scale=a_scale)

    def run_cutlass_from_graph(
            a: torch.Tensor, a_scale: torch.Tensor, w1_q: torch.Tensor,
            w2_q: torch.Tensor, w1_scale: torch.Tensor, w2_scale: torch.Tensor,
            topk_weights: torch.Tensor, topk_ids: torch.Tensor,
            ab_strides1: torch.Tensor, c_strides1: torch.Tensor,
            ab_strides2: torch.Tensor, c_strides2: torch.Tensor):
        with set_current_vllm_config(
                VllmConfig(parallel_config=ParallelConfig(
                    pipeline_parallel_size=1))):
            return cutlass_moe_fp8(a,
                                   w1_q,
                                   w2_q,
                                   w1_scale,
                                   w2_scale,
                                   topk_weights,
                                   topk_ids,
                                   ab_strides1,
                                   c_strides1,
                                   ab_strides2,
                                   c_strides2,
                                   a1_scale=a_scale)

    def run_triton_from_graph(a: torch.Tensor, w1: torch.Tensor,
                              w2: torch.Tensor, topk_weights: torch.Tensor,
                              topk_ids: torch.Tensor, w1_scale: torch.Tensor,
                              w2_scale: torch.Tensor, a_scale: torch.Tensor):
        with set_current_vllm_config(
                VllmConfig(parallel_config=ParallelConfig(
                    pipeline_parallel_size=1))):
            return fused_experts(a,
                                 w1,
                                 w2,
                                 topk_weights,
                                 topk_ids,
                                 use_fp8_w8a8=True,
                                 w1_scale=w1_scale,
                                 w2_scale=w2_scale,
                                 a1_scale=a_scale)

    def replay_graph(graph, num_repeats):
        for _ in range(num_repeats):
            graph.replay()
        torch.cuda.synchronize()

    cutlass_stream = torch.cuda.Stream()
    cutlass_graph = torch.cuda.CUDAGraph()
    with torch.cuda.graph(cutlass_graph, stream=cutlass_stream):
        run_cutlass_from_graph(a, a_scale, w1_q, w2_q, w1_scale, w2_scale,
                               topk_weights, topk_ids, ab_strides1, c_strides1,
                               ab_strides2, c_strides2)
    torch.cuda.synchronize()

    triton_stream = torch.cuda.Stream()
    triton_graph = torch.cuda.CUDAGraph()
    with torch.cuda.graph(triton_graph, stream=triton_stream):
        run_triton_from_graph(a, w1_q_notransp, w2_q_notransp, topk_weights,
                              topk_ids, w1_scale, w2_scale, a_scale)
    torch.cuda.synchronize()

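    # Both kernels are also captured into CUDA graphs above so that launch
    # overhead can be measured separately; replay_graph() re-executes the
    # captured work num_repeats times and synchronizes once at the end.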
    min_run_time = 5
    num_warmup = 5
    num_runs = 25

    globals = {
        # Baseline params
        "w1": w1,
        "w2": w2,
        "score": score,
        "topk": topk,
        "w1_q_notransp": w1_q_notransp,
        "w2_q_notransp": w2_q_notransp,
        # Cutlass params
        "a_scale": a_scale,
        "w1_q": w1_q,
        "w2_q": w2_q,
        "w1_scale": w1_scale,
        "w2_scale": w2_scale,
        "ab_strides1": ab_strides1,
        "c_strides1": c_strides1,
        "ab_strides2": ab_strides2,
        "c_strides2": c_strides2,
        # cuda graph params
        "cutlass_graph": cutlass_graph,
        "triton_graph": triton_graph,
        # Gen params
        "a": a,
        "topk_weights": topk_weights,
        "topk_ids": topk_ids,
        "num_runs": num_runs,
        # Kernels
        "run_triton_moe": run_triton_moe,
        "run_cutlass_moe": run_cutlass_moe,
        "replay_graph": replay_graph,
    }

    # Warmup
    run_triton_moe(a, w1_q_notransp, w2_q_notransp, topk_weights, topk_ids,
                   w1_scale, w2_scale, a_scale, num_warmup)

    results.append(
        benchmark.Timer(
            stmt=
            "run_triton_moe(a, w1_q_notransp, w2_q_notransp, topk_weights, topk_ids, w1_scale, w2_scale, a_scale, num_runs)",  # noqa: E501
            globals=globals,
            label=label,
            sub_label=sub_label,
            description="triton_moe",
        ).blocked_autorange(min_run_time=min_run_time))

    # Warmup
    replay_graph(triton_graph, num_warmup)

    results.append(
        benchmark.Timer(
            stmt="replay_graph(triton_graph, num_runs)",
            globals=globals,
            label=label,
            sub_label=sub_label,
            description="triton_moe_cuda_graphs",
        ).blocked_autorange(min_run_time=min_run_time))

    # Warmup
    run_cutlass_moe(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, topk_weights,
                    topk_ids, ab_strides1, c_strides1, ab_strides2, c_strides2,
                    num_warmup)

    results.append(
        benchmark.Timer(
            stmt=
            "run_cutlass_moe(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, topk_weights, topk_ids, ab_strides1, c_strides1, ab_strides2, c_strides2, num_runs)",  # noqa: E501
            globals=globals,
            label=label,
            sub_label=sub_label,
            description="grouped_gemm_moe",
        ).blocked_autorange(min_run_time=min_run_time))

    # Warmup
    replay_graph(cutlass_graph, num_warmup)

    results.append(
        benchmark.Timer(
            stmt="replay_graph(cutlass_graph, num_runs)",
            globals=globals,
            label=label,
            sub_label=sub_label,
            description="grouped_gemm_moe_cuda_graphs",
        ).blocked_autorange(min_run_time=min_run_time))

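# The measurements above follow the standard torch.utils.benchmark pattern.
# A minimal, self-contained sketch of that pattern (hypothetical stmt and
# globals, shown only for illustration):
#
#     import torch
#     import torch.utils.benchmark as benchmark
#
#     t = benchmark.Timer(stmt="fn(x)",
#                         globals={"fn": torch.sin, "x": torch.randn(1024)},
#                         label="Quant Matmul",
#                         sub_label="example",
#                         description="sketch")
#     benchmark.Compare([t.blocked_autorange(min_run_time=1)]).print()
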
def main(args):
    print("Benchmarking models:")
    for i, model in enumerate(args.models):
        print(f"[{i}] {model}")

    results: list[benchmark.Measurement] = []

    for model in args.models:
        for tp in args.tp_sizes:
            for layer in WEIGHT_SHAPES_MOE[model]:
                num_experts = layer[0]
                topk = layer[1]
                size_k = layer[2]
                size_n = layer[3] // tp

                if len(args.limit_k) > 0 and size_k not in args.limit_k:
                    continue

                if len(args.limit_n) > 0 and size_n not in args.limit_n:
                    continue

                for per_act_token in PER_ACT_TOKEN_OPTS:
                    for per_out_ch in PER_OUT_CH_OPTS:
                        for size_m in DEFAULT_BATCH_SIZES:
                            mkn = (size_m, size_k, size_n)
                            bench_run(results, model, num_experts, topk,
                                      per_act_token, per_out_ch, mkn)

    compare = benchmark.Compare(results)
    compare.print()

if __name__ == "__main__":
    parser = FlexibleArgumentParser(
        description="Benchmark CUTLASS grouped GEMM MoE across specified "
        "models/shapes/batches")
    parser.add_argument(
        "--models",
        nargs="+",
        type=str,
        default=DEFAULT_MODELS,
        choices=WEIGHT_SHAPES_MOE.keys(),
    )
    parser.add_argument("--tp-sizes",
                        nargs="+",
                        type=int,
                        default=DEFAULT_TP_SIZES)
    parser.add_argument("--batch-sizes",
                        nargs="+",
                        type=int,
                        default=DEFAULT_BATCH_SIZES)
    parser.add_argument("--limit-k", nargs="+", type=int, default=[])
    parser.add_argument("--limit-n", nargs="+", type=int, default=[])
    parser.add_argument("--limit-num-groups", nargs="+", type=int, default=[])
    parser.add_argument("--limit-per-act-token",
                        nargs="+",
                        type=int,
                        default=[])
    parser.add_argument("--limit-per-out-ch", nargs="+", type=int, default=[])

    args = parser.parse_args()
    main(args)
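# Example invocation (the script path below is an assumption; the flags and
# model names come from the parser and defaults defined above):
#
#     python benchmark_grouped_gemm_cutlass.py \
#         --models nm-testing/deepseekv2-lite \
#         --tp-sizes 1 \
#         --batch-sizes 1 64 256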