Mirror of https://github.com/vllm-project/vllm.git (synced 2025-11-19 01:04:47 +08:00)

Comparing commits: codex/remo...v0.11.1rc7 (970 commits)
@@ -5,11 +5,11 @@ import os
 import sys
 import zipfile
 
-# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 450 MiB
+# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 500 MiB
 # Note that we have 800 MiB quota, please use it wisely.
 # See https://github.com/pypi/support/issues/6326 .
 # Please also sync the value with the one in Dockerfile.
-VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 450))
+VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 500))
 
 
 def print_top_10_largest_files(zip_file):
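For context, this wheel-size gate reads the limit from the environment and fails the build when the produced wheel exceeds it; the diff only raises the default from 450 to 500 MiB. A minimal sketch of that kind of check, assuming the size is taken from the wheel on disk (the function name and exact reporting below are illustrative, not the repository's implementation):

```python
import os
import sys
import zipfile

# Same default the diff raises from 450 to 500 MiB; overridable via the environment.
VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 500))


def check_wheel_size(wheel_path: str) -> int:
    """Illustrative: return 0 if the wheel is under the limit, 1 otherwise."""
    size_mb = os.path.getsize(wheel_path) / (1024 * 1024)
    if size_mb > VLLM_MAX_SIZE_MB:
        print(f"{wheel_path}: {size_mb:.1f} MiB exceeds the {VLLM_MAX_SIZE_MB} MiB limit")
        # Listing the largest archive members helps diagnose what grew.
        with zipfile.ZipFile(wheel_path) as zf:
            for info in sorted(zf.infolist(), key=lambda i: i.file_size, reverse=True)[:10]:
                print(f"  {info.filename}: {info.file_size / (1024 * 1024):.1f} MiB")
        return 1
    return 0


if __name__ == "__main__":
    sys.exit(check_wheel_size(sys.argv[1]))
```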
@@ -1,12 +1,12 @@
 # For vllm script, with -t option (tensor parallel size).
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise -b "auto" -l 1000 -f 5 -t 1
-model_name: "nm-testing/Qwen2-1.5B-Instruct-W8A16-Channelwise"
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m HandH1998/QQQ-Llama-3-8b-g128 -b 32 -l 1000 -f 5 -t 1
+model_name: "HandH1998/QQQ-Llama-3-8b-g128"
 tasks:
 - name: "gsm8k"
   metrics:
   - name: "exact_match,strict-match"
-    value: 0.595
+    value: 0.419
   - name: "exact_match,flexible-extract"
-    value: 0.582
+    value: 0.416
 limit: 1000
 num_fewshot: 5
@@ -0,0 +1,12 @@
+# For hf script, without -t option (tensor parallel size).
+# bash .buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -l 100 -t 8
+model_name: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
+backend: "vllm-vlm"
+tasks:
+- name: "chartqa"
+  metrics:
+  - name: "relaxed_accuracy,none"
+    # TODO(zhewenl): model card is 0.90, but the actual score is 0.80.
+    value: 0.80
+limit: 100
+num_fewshot: 0
@@ -0,0 +1,10 @@
+# For hf script, without -t option (tensor parallel size).
+# bash .buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -l 250 -t 8 -f 5
+model_name: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
+tasks:
+- name: "mmlu_pro"
+  metrics:
+  - name: "exact_match,custom-extract"
+    value: 0.80
+limit: 250 # will run on 250 * 14 subjects = 3500 samples
+num_fewshot: 5
@@ -1,4 +1,5 @@
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -b auto -l 1319 -f 5 -t 1
+# For vllm script, with -t option (tensor parallel size)
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -l 1319 -t 1
 model_name: "RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic"
 tasks:
 - name: "gsm8k"
@@ -0,0 +1,12 @@
+# For vllm script, with -t option (tensor parallel size).
+# bash .buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh -m Qwen/Qwen2.5-VL-7B-Instruct -l 2500 -t 1
+
+model_name: "Qwen/Qwen2.5-VL-7B-Instruct"
+backend: "vllm-vlm"
+tasks:
+- name: "chartqa"
+  metrics:
+  - name: "relaxed_accuracy,none"
+    value: 0.855
+limit: 2500
+num_fewshot: 0
@@ -0,0 +1,14 @@
+model_name: "Qwen/Qwen3-235B-A22B-Instruct-2507-FP8"
+tasks:
+- name: "mmlu_pro"
+  metrics:
+  - name: "exact_match,custom-extract"
+    value: 0.82
+limit: 250 # will run on 250 * 14 subjects = 3500 samples
+num_fewshot: 5
+enforce_eager: false # we use false to speed up the eval process
+kv_cache_dtype: fp8 # we use fp8 to speed up the eval process
+max_model_len: 40960
+apply_chat_template: true
+fewshot_as_multiturn: true
+gen_kwargs: "temperature=0,top_p=1,top_k=0,max_gen_toks=5632,until=<|ENDANSWER|>"
@@ -0,0 +1 @@
+Qwen3-235B-A22B-Instruct-2507-FP8.yaml
@@ -0,0 +1 @@
+Meta-Llama-4-Maverick-17B-128E-Instruct-FP8-MM.yaml
.buildkite/lm-eval-harness/configs/models-mm-small.txt (new file, 1 line)
@@ -0,0 +1 @@
+Qwen2.5-VL-7B-Instruct.yaml
.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh (new executable file, 44 lines)
@@ -0,0 +1,44 @@
+#!/bin/bash
+# We can use this script to compute baseline accuracy on chartqa for vllm.
+#
+# Make sure you have lm-eval-harness installed:
+# pip install lm-eval==0.4.9
+
+usage() {
+    echo``
+    echo "Runs lm eval harness on ChartQA using multimodal vllm."
+    echo "This pathway is intended to be used to create baselines for "
+    echo "our correctness tests in vllm's CI."
+    echo
+    echo "usage: ${0} <options>"
+    echo
+    echo " -m - huggingface stub or local directory of the model"
+    echo " -l - limit number of samples to run"
+    echo " -t - tensor parallel size to run at"
+    echo
+}
+
+while getopts "m:l:t:" OPT; do
+  case ${OPT} in
+    m )
+        MODEL="$OPTARG"
+        ;;
+    l )
+        LIMIT="$OPTARG"
+        ;;
+    t )
+        TP_SIZE="$OPTARG"
+        ;;
+    \? )
+        usage
+        exit 1
+        ;;
+  esac
+done
+
+lm_eval --model vllm-vlm \
+  --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE" \
+  --tasks chartqa \
+  --batch_size auto \
+  --apply_chat_template \
+  --limit $LIMIT
.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh (mode change only: Normal file → Executable file)
@@ -0,0 +1,50 @@
+#!/bin/bash
+# We can use this script to compute baseline accuracy on MMLUPRO for vllm.
+# We use this for fp8, which HF does not support.
+#
+# Make sure you have lm-eval-harness installed:
+# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
+
+usage() {
+    echo``
+    echo "Runs lm eval harness on MMLU Pro using huggingface transformers."
+    echo "This pathway is intended to be used to create baselines for "
+    echo "our automated nm-test-accuracy workflow"
+    echo
+    echo "usage: ${0} <options>"
+    echo
+    echo " -m - huggingface stub or local directory of the model"
+    echo " -l - limit number of samples to run"
+    echo " -f - number of fewshot samples to use"
+    echo " -t - tensor parallel size to run at"
+    echo
+}
+
+while getopts "m:b:l:f:t:" OPT; do
+  case ${OPT} in
+    m )
+        MODEL="$OPTARG"
+        ;;
+    b )
+        BATCH_SIZE="$OPTARG"
+        ;;
+    l )
+        LIMIT="$OPTARG"
+        ;;
+    f )
+        FEWSHOT="$OPTARG"
+        ;;
+    t )
+        TP_SIZE="$OPTARG"
+        ;;
+    \? )
+        usage
+        exit 1
+        ;;
+  esac
+done
+
+lm_eval --model vllm \
+  --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,add_bos_token=true,trust_remote_code=true,max_model_len=4096" \
+  --tasks mmlu_pro --num_fewshot "$FEWSHOT" --limit "$LIMIT" \
+  --batch_size auto
@@ -19,21 +19,35 @@ RTOL = 0.08
 def launch_lm_eval(eval_config, tp_size):
     trust_remote_code = eval_config.get("trust_remote_code", False)
     max_model_len = eval_config.get("max_model_len", 4096)
+    batch_size = eval_config.get("batch_size", "auto")
+    backend = eval_config.get("backend", "vllm")
+    enforce_eager = eval_config.get("enforce_eager", "true")
+    kv_cache_dtype = eval_config.get("kv_cache_dtype", "auto")
     model_args = (
         f"pretrained={eval_config['model_name']},"
         f"tensor_parallel_size={tp_size},"
-        f"enforce_eager=true,"
+        f"enforce_eager={enforce_eager},"
+        f"kv_cache_dtype={kv_cache_dtype},"
         f"add_bos_token=true,"
         f"trust_remote_code={trust_remote_code},"
-        f"max_model_len={max_model_len}"
+        f"max_model_len={max_model_len},"
     )
     results = lm_eval.simple_evaluate(
-        model="vllm",
+        model=backend,
         model_args=model_args,
         tasks=[task["name"] for task in eval_config["tasks"]],
         num_fewshot=eval_config["num_fewshot"],
         limit=eval_config["limit"],
-        batch_size="auto",
+        # TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed help
+        # text models. however, this is regressing measured strict-match for
+        # existing text models in CI, so only apply it for mm, or explicitly set
+        apply_chat_template=eval_config.get(
+            "apply_chat_template", backend == "vllm-vlm"
+        ),
+        fewshot_as_multiturn=eval_config.get("fewshot_as_multiturn", False),
+        # Forward decoding and early-stop controls (e.g., max_gen_toks, until=...)
+        gen_kwargs=eval_config.get("gen_kwargs"),
+        batch_size=batch_size,
     )
     return results
 
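The new keys picked up by `launch_lm_eval` are all optional, so existing text-model configs keep their previous behaviour. The captured hunk shows only the launcher; the surrounding correctness test (not shown here) compares each measured metric against the expected value from the YAML config within `RTOL = 0.08`. A hedged sketch of that comparison, assuming lm-eval's usual `results["results"][task][metric]` layout and the config fields shown in the YAML files above; the real assertion may differ:

```python
import math

RTOL = 0.08  # from the hunk header above


def check_accuracy(results: dict, eval_config: dict) -> None:
    """Illustrative only: assert each measured metric is within RTOL of the
    expected value recorded in the eval config."""
    for task in eval_config["tasks"]:
        for metric in task["metrics"]:
            measured = results["results"][task["name"]][metric["name"]]
            expected = metric["value"]
            assert math.isclose(measured, expected, rel_tol=RTOL), (
                f"{task['name']}/{metric['name']}: measured {measured:.3f}, "
                f"expected {expected:.3f} (rtol={RTOL})"
            )
```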
@@ -1,184 +0,0 @@
-steps:
-  - label: "Wait for container to be ready"
-    key: wait-for-container-image
-    agents:
-      queue: A100
-    plugins:
-      - kubernetes:
-          podSpec:
-            containers:
-              - image: badouralix/curl-jq
-                command:
-                  - sh .buildkite/nightly-benchmarks/scripts/wait-for-image.sh
-  - label: "Cleanup H100"
-    agents:
-      queue: H100
-    depends_on: ~
-    command: docker system prune -a --volumes --force
-
-  - label: "A100"
-    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
-    agents:
-      queue: A100
-    depends_on: wait-for-container-image
-    if: build.branch == "main"
-    plugins:
-      - kubernetes:
-          podSpec:
-            priorityClassName: perf-benchmark
-            containers:
-              - image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
-                command:
-                  - bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
-                resources:
-                  limits:
-                    nvidia.com/gpu: 8
-                volumeMounts:
-                  - name: devshm
-                    mountPath: /dev/shm
-                env:
-                  - name: VLLM_USAGE_SOURCE
-                    value: ci-test
-                  - name: HF_TOKEN
-                    valueFrom:
-                      secretKeyRef:
-                        name: hf-token-secret
-                        key: token
-            nodeSelector:
-              nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
-            volumes:
-              - name: devshm
-                emptyDir:
-                  medium: Memory
-
-  - label: "H200"
-    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
-    agents:
-      queue: H200
-    depends_on: wait-for-container-image
-    if: build.branch == "main"
-    plugins:
-      - docker#v5.12.0:
-          image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
-          command:
-            - bash
-            - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
-          mount-buildkite-agent: true
-          propagate-environment: true
-          ipc: host
-          gpus: 4,5,6,7
-          volumes:
-            - /data/benchmark-hf-cache:/root/.cache/huggingface
-          environment:
-            - VLLM_USAGE_SOURCE
-            - HF_TOKEN
-
-  #- block: "Run H100 Benchmark"
-  #key: block-h100
-  #depends_on: ~
-
-  - label: "H100"
-    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
-    agents:
-      queue: H100
-    depends_on: wait-for-container-image
-    if: build.branch == "main"
-    plugins:
-      - docker#v5.12.0:
-          image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
-          command:
-            - bash
-            - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
-          mount-buildkite-agent: true
-          propagate-environment: true
-          ipc: host
-          gpus: all # see CUDA_VISIBLE_DEVICES for actual GPUs used
-          volumes:
-            - /data/benchmark-hf-cache:/root/.cache/huggingface
-          environment:
-            - VLLM_USAGE_SOURCE
-            - HF_TOKEN
-
-  # Premerge benchmark
-  - label: "A100"
-    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
-    agents:
-      queue: A100
-    depends_on: wait-for-container-image
-    if: build.branch != "main"
-    plugins:
-      - kubernetes:
-          podSpec:
-            priorityClassName: perf-benchmark
-            containers:
-              - image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
-                command:
-                  - bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
-                resources:
-                  limits:
-                    nvidia.com/gpu: 8
-                volumeMounts:
-                  - name: devshm
-                    mountPath: /dev/shm
-                env:
-                  - name: VLLM_USAGE_SOURCE
-                    value: ci-test
-                  - name: HF_TOKEN
-                    valueFrom:
-                      secretKeyRef:
-                        name: hf-token-secret
-                        key: token
-            nodeSelector:
-              nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
-            volumes:
-              - name: devshm
-                emptyDir:
-                  medium: Memory
-
-  - label: "H200"
-    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
-    agents:
-      queue: H200
-    depends_on: wait-for-container-image
-    if: build.branch != "main"
-    plugins:
-      - docker#v5.12.0:
-          image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
-          command:
-            - bash
-            - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
-          mount-buildkite-agent: true
-          propagate-environment: true
-          ipc: host
-          gpus: 4,5,6,7
-          volumes:
-            - /data/benchmark-hf-cache:/root/.cache/huggingface
-          environment:
-            - VLLM_USAGE_SOURCE
-            - HF_TOKEN
-
-  #- block: "Run H100 Benchmark"
-  #key: block-h100
-  #depends_on: ~
-
-  - label: "H100"
-    # skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
-    agents:
-      queue: H100
-    depends_on: wait-for-container-image
-    if: build.branch != "main"
-    plugins:
-      - docker#v5.12.0:
-          image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
-          command:
-            - bash
-            - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
-          mount-buildkite-agent: true
-          propagate-environment: true
-          ipc: host
-          gpus: all # see CUDA_VISIBLE_DEVICES for actual GPUs used
-          volumes:
-            - /data/benchmark-hf-cache:/root/.cache/huggingface
-          environment:
-            - VLLM_USAGE_SOURCE
-            - HF_TOKEN
@@ -1,28 +0,0 @@
-# Nightly benchmark annotation
-
-## Description
-
-This file contains the downloading link for benchmarking results.
-
-- [benchmarking pipeline](artifact://nightly-pipeline.yaml)
-- [benchmarking results](artifact://results.zip)
-- [benchmarking code](artifact://nightly-benchmarks.zip)
-
-Please download the visualization scripts in the post
-
-## Results reproduction
-
-- Find the docker we use in `benchmarking pipeline`
-- Deploy the docker, and inside the docker:
-  - Download `nightly-benchmarks.zip`.
-  - In the same folder, run the following code:
-
-```bash
-export HF_TOKEN=<your HF token>
-apt update
-apt install -y git
-unzip nightly-benchmarks.zip
-VLLM_SOURCE_CODE_LOC=./ bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
-```
-
-And the results will be inside `./benchmarks/results`.
@@ -1,39 +0,0 @@
-
-# Nightly benchmark
-
-This benchmark aims to:
-
-- Provide performance clarity: Provide clarity on which one (vllm, tensorrt-llm, lmdeploy and SGLang) leads in performance in what workload.
-- Be reproducible: one can run the exact same set of benchmarking commands inside the exact same docker by following reproducing instructions.
-
-Latest results: [results link](https://blog.vllm.ai/2024/09/05/perf-update.html), scroll to the end.
-
-Latest reproduction guide: [github issue link](https://github.com/vllm-project/vllm/issues/8176)
-
-## Setup
-
-- Docker images:
-  - vLLM: `vllm/vllm-openai:v0.6.2`
-  - SGLang: `lmsysorg/sglang:v0.3.2-cu121`
-  - LMDeploy: `openmmlab/lmdeploy:v0.6.1-cu12`
-  - TensorRT-LLM: `nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3`
-    - *NOTE: we use r24.07 as the current implementation only works for this version. We are going to bump this up.*
-  - Check [nightly-pipeline.yaml](nightly-pipeline.yaml) for the concrete docker images, specs and commands we use for the benchmark.
-- Hardware
-  - 8x Nvidia A100 GPUs
-- Workload:
-  - Dataset
-    - ShareGPT dataset
-    - Prefill-heavy dataset (in average 462 input tokens, 16 tokens as output)
-    - Decode-heavy dataset (in average 462 input tokens, 256 output tokens)
-    - Check [nightly-tests.json](tests/nightly-tests.json) for the concrete configuration of datasets we use.
-  - Models: llama-3 8B, llama-3 70B.
-    - We do not use llama 3.1 as it is incompatible with trt-llm r24.07. ([issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105)).
-  - Average QPS (query per second): 2, 4, 8, 16, 32 and inf.
-    - Queries are randomly sampled, and arrival patterns are determined via Poisson process, but all with fixed random seed.
-  - Evaluation metrics: Throughput (higher the better), TTFT (time to the first token, lower the better), ITL (inter-token latency, lower the better).
-
-## Known issues
-
-- TRT-LLM crashes with Llama 3.1 8B [issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105).
-- TGI does not support `ignore-eos` flag.
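The deleted README's workload description (Poisson arrivals at a fixed average QPS, with a fixed random seed) follows the standard recipe for generating such request traces: a Poisson process with rate `qps` has exponential inter-arrival gaps with mean `1/qps`. A small illustrative sketch of that sampling, not the benchmark's actual code:

```python
import numpy as np


def poisson_arrival_times(qps: float, num_requests: int, seed: int = 0) -> np.ndarray:
    """Illustrative only: draw exponential inter-arrival gaps with mean 1/qps
    and accumulate them into absolute send times; a fixed seed keeps the
    generated trace reproducible across runs."""
    rng = np.random.default_rng(seed)
    gaps = rng.exponential(scale=1.0 / qps, size=num_requests)
    return np.cumsum(gaps)


# e.g. 32 requests at an average of 8 queries per second
print(poisson_arrival_times(qps=8, num_requests=32)[:5])
```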
@@ -1,196 +0,0 @@
-common_pod_spec: &common_pod_spec
-  priorityClassName: perf-benchmark
-  nodeSelector:
-    nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
-  volumes:
-    - name: devshm
-      emptyDir:
-        medium: Memory
-    - name: hf-cache
-      hostPath:
-        path: /root/.cache/huggingface
-        type: Directory
-
-common_container_settings: &common_container_settings
-  command:
-    - bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
-  resources:
-    limits:
-      nvidia.com/gpu: 8
-  volumeMounts:
-    - name: devshm
-      mountPath: /dev/shm
-    - name: hf-cache
-      mountPath: /root/.cache/huggingface
-  env:
-    - name: VLLM_USAGE_SOURCE
-      value: ci-test
-    - name: HF_HOME
-      value: /root/.cache/huggingface
-    - name: VLLM_SOURCE_CODE_LOC
-      value: /workspace/build/buildkite/vllm/performance-benchmark
-    - name: HF_TOKEN
-      valueFrom:
-        secretKeyRef:
-          name: hf-token-secret
-          key: token
-
-steps:
-  - block: ":rocket: Ready for comparing vllm against alternatives? This will take 4 hours."
-
-
-
-  - label: "A100 vllm step 10"
-    priority: 100
-    agents:
-      queue: A100
-    plugins:
-      - kubernetes:
-          podSpec:
-            <<: *common_pod_spec
-            containers:
-              - image: vllm/vllm-openai:v0.6.2
-                <<: *common_container_settings
-
-
-
-  - label: "A100 sglang benchmark"
-    priority: 100
-    agents:
-      queue: A100
-    plugins:
-      - kubernetes:
-          podSpec:
-            <<: *common_pod_spec
-            containers:
-              - image: lmsysorg/sglang:v0.3.2-cu121
-                <<: *common_container_settings
-
-  - label: "A100 lmdeploy benchmark"
-    priority: 100
-    agents:
-      queue: A100
-    plugins:
-      - kubernetes:
-          podSpec:
-            <<: *common_pod_spec
-            containers:
-              - image: openmmlab/lmdeploy:v0.6.1-cu12
-                <<: *common_container_settings
-
-
-
-
-  - label: "A100 trt llama-8B"
-    priority: 100
-    agents:
-      queue: A100
-    plugins:
-      - kubernetes:
-          podSpec:
-            <<: *common_pod_spec
-            containers:
-              - image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3
-                <<: *common_container_settings
-                env:
-                  - name: VLLM_USAGE_SOURCE
-                    value: ci-test
-                  - name: HF_HOME
-                    value: /root/.cache/huggingface
-                  - name: VLLM_SOURCE_CODE_LOC
-                    value: /workspace/build/buildkite/vllm/performance-benchmark
-                  - name: HF_TOKEN
-                    valueFrom:
-                      secretKeyRef:
-                        name: hf-token-secret
-                        key: token
-                  - name: TEST_SELECTOR
-                    value: "llama8B"
-
-
-  - label: "A100 trt llama-70B"
-    priority: 100
-    agents:
-      queue: A100
-    plugins:
-      - kubernetes:
-          podSpec:
-            <<: *common_pod_spec
-            containers:
-              - image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3
-                <<: *common_container_settings
-                env:
-                  - name: VLLM_USAGE_SOURCE
-                    value: ci-test
-                  - name: HF_HOME
-                    value: /root/.cache/huggingface
-                  - name: VLLM_SOURCE_CODE_LOC
-                    value: /workspace/build/buildkite/vllm/performance-benchmark
-                  - name: HF_TOKEN
-                    valueFrom:
-                      secretKeyRef:
-                        name: hf-token-secret
-                        key: token
-                  - name: TEST_SELECTOR
-                    value: "llama70B"
-
-
-  # FIXME(Kuntai): uncomment this after NVIDIA gives us their test docker image
-  # - label: "A100 trt benchmark"
-  #   priority: 100
-  #   agents:
-  #     queue: A100
-  #   plugins:
-  #     - kubernetes:
-  #         podSpec:
-  #           <<: *common_pod_spec
-  #           containers:
-  #             - image: nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3
-  #               <<: *common_container_settings
-
-
-  # FIXME(Kuntai): uncomment this after TGI supports `--ignore-eos`.
-  # - label: "A100 tgi benchmark"
-  #   priority: 100
-  #   agents:
-  #     queue: A100
-  #   plugins:
-  #     - kubernetes:
-  #         podSpec:
-  #           <<: *common_pod_spec
-  #           containers:
-  #             - image: ghcr.io/huggingface/text-generation-inference:2.2.0
-  #               <<: *common_container_settings
-
-  - wait
-
-  - label: "Collect the results"
-    priority: 100
-    agents:
-      queue: A100
-    plugins:
-      - kubernetes:
-          podSpec:
-            <<: *common_pod_spec
-            containers:
-              - image: vllm/vllm-openai:v0.5.0.post1
-                command:
-                  - bash .buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
-                resources:
-                  limits:
-                    nvidia.com/gpu: 8
-                volumeMounts:
-                  - name: devshm
-                    mountPath: /dev/shm
-                env:
-                  - name: VLLM_USAGE_SOURCE
-                    value: ci-test
-                  - name: VLLM_SOURCE_CODE_LOC
-                    value: /workspace/build/buildkite/vllm/performance-benchmark
-                  - name: HF_TOKEN
-                    valueFrom:
-                      secretKeyRef:
-                        name: hf-token-secret
-                        key: token
-
-  - block: ":rocket: check the results!"
@@ -1,26 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import argparse
-
-from transformers import AutoTokenizer
-
-
-def main(model, cachedir):
-    # Load the tokenizer and save it to the specified directory
-    tokenizer = AutoTokenizer.from_pretrained(model)
-    tokenizer.save_pretrained(cachedir)
-    print(f"Tokenizer saved to {cachedir}")
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
-        description="Download and save Hugging Face tokenizer"
-    )
-    parser.add_argument("--model", type=str, required=True, help="Name of the model")
-    parser.add_argument(
-        "--cachedir", type=str, required=True, help="Directory to save the tokenizer"
-    )
-
-    args = parser.parse_args()
-    main(args.model, args.cachedir)
@@ -1,97 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import argparse
-import json
-from pathlib import Path
-
-import numpy as np
-import pandas as pd
-from tabulate import tabulate
-
-
-def parse_arguments():
-    parser = argparse.ArgumentParser(
-        description="Parse command line arguments for summary-nightly-results script."
-    )
-    parser.add_argument(
-        "--results-folder",
-        type=str,
-        required=True,
-        help="The folder where the results are stored.",
-    )
-    parser.add_argument(
-        "--description", type=str, required=True, help="Description of the results."
-    )
-
-    args = parser.parse_args()
-    return args
-
-
-def get_perf(df, method, model, metric):
-    means = []
-
-    for qps in [2, 4, 8, 16, "inf"]:
-        target = df["Test name"].str.contains(model)
-        target = target & df["Engine"].str.contains(method)
-        target = target & df["Test name"].str.contains("qps_" + str(qps))
-        filtered_df = df[target]
-
-        if filtered_df.empty:
-            means.append(0.0)
-        else:
-            means.append(filtered_df[metric].values[0])
-
-    return np.array(means)
-
-
-def get_perf_w_std(df, method, model, metric):
-    if metric in ["TTFT", "ITL"]:
-        mean = get_perf(df, method, model, "Mean " + metric + " (ms)")
-        mean = mean.tolist()
-        std = get_perf(df, method, model, "Std " + metric + " (ms)")
-        if std.mean() == 0:
-            std = None
-        success = get_perf(df, method, model, "Successful req.")
-        if std is not None:
-            std = std / np.sqrt(success)
-            std = std.tolist()
-
-    else:
-        assert metric == "Tput"
-        mean = get_perf(df, method, model, "Input Tput (tok/s)") + get_perf(
-            df, method, model, "Output Tput (tok/s)"
-        )
-        mean = mean.tolist()
-        std = None
-
-    return mean, std
-
-
-def main(args):
-    results_folder = Path(args.results_folder)
-
-    results = []
-
-    # collect results
-    for test_file in results_folder.glob("*_nightly_results.json"):
-        with open(test_file) as f:
-            results = results + json.loads(f.read())
-
-    # generate markdown table
-    df = pd.DataFrame.from_dict(results)
-
-    md_table = tabulate(df, headers="keys", tablefmt="pipe", showindex=False)
-
-    with open(args.description) as f:
-        description = f.read()
-
-    description = description.format(nightly_results_benchmarking_table=md_table)
-
-    with open("nightly_results.md", "w") as f:
-        f.write(description)
-
-
-if __name__ == "__main__":
-    args = parse_arguments()
-    main(args)
@@ -1,9 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-from lmdeploy.serve.openai.api_client import APIClient
-
-api_client = APIClient("http://localhost:8000")
-model_name = api_client.available_models[0]
-
-print(model_name)
@@ -1,78 +0,0 @@
-#!/bin/bash
-
-set -ex
-set -o pipefail
-
-
-main() {
-
-  (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
-  (which jq) || (apt-get update && apt-get -y install jq)
-  (which zip) || (apt-get install -y zip)
-
-  if [ ! -f /workspace/buildkite-agent ]; then
-    echo "buildkite-agent binary not found. Skip plotting the results."
-    exit 0
-  fi
-
-  # initial annotation
-  #description="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-descriptions.md"
-
-  # download results
-  cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
-  mkdir -p results/
-  /workspace/buildkite-agent artifact download 'results/*nightly_results.json' results/
-  ls
-  ls results/
-
-  # upload benchmark results
-  zip -r results.zip results/
-  /workspace/buildkite-agent artifact upload "results.zip"
-
-  # upload benchmarking scripts
-  cd "$VLLM_SOURCE_CODE_LOC/"
-  zip -r nightly-benchmarks.zip .buildkite/ benchmarks/
-  /workspace/buildkite-agent artifact upload "nightly-benchmarks.zip"
-
-  cd "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/"
-  # upload benchmarking pipeline
-  /workspace/buildkite-agent artifact upload "nightly-pipeline.yaml"
-
-  cd "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/"
-  /workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly-annotation.md
-
-
-
-  # The figures should be generated by a separate process outside the CI/CD pipeline
-
-  # # generate figures
-  # python3 -m pip install tabulate pandas matplotlib
-
-  # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py \
-  #   --description $description \
-  #   --results-folder results/
-
-
-  # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
-  #   --description $description \
-  #   --results-folder results/ \
-  #   --dataset sharegpt
-
-  # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
-  #   --description $description \
-  #   --results-folder results/ \
-  #   --dataset sonnet_2048_128
-
-  # python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
-  #   --description $description \
-  #   --results-folder results/ \
-  #   --dataset sonnet_128_2048
-
-  # # upload results and figures
-  # /workspace/buildkite-agent artifact upload "nightly_results*.png"
-  # /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
-  # /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/tests/nightly-tests.json
-  # /workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly_results.md
-}
-
-main "$@"
@ -1,464 +0,0 @@
#!/bin/bash

set -o pipefail
set -x

check_gpus() {
  # check the number of GPUs and GPU type.
  declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
  if [[ $gpu_count -gt 0 ]]; then
    echo "GPU found."
  else
    echo "Need at least 1 GPU to run benchmarking."
    exit 1
  fi
  declare -g gpu_type="$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')"
  echo "GPU type is $gpu_type"
}

check_hf_token() {
  # check if HF_TOKEN is available and valid
  if [[ -z "$HF_TOKEN" ]]; then
    echo "Error: HF_TOKEN is not set."
    exit 1
  elif [[ ! "$HF_TOKEN" =~ ^hf_ ]]; then
    echo "Error: HF_TOKEN does not start with 'hf_'."
    exit 1
  else
    echo "HF_TOKEN is set and valid."
  fi
}

upload_to_buildkite() {
  # upload the benchmarking results to buildkite

  # if the agent binary is not found, skip uploading the results, exit 0
  if [ ! -f /workspace/buildkite-agent ]; then
    echo "buildkite-agent binary not found. Skip uploading the results."
    return 0
  fi
  # /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
  /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
}

get_current_llm_serving_engine() {

  if which lmdeploy >/dev/null; then
    echo "Container: lmdeploy"
    export CURRENT_LLM_SERVING_ENGINE=lmdeploy
    return
  fi

  if [ -e /tgi-entrypoint.sh ]; then
    echo "Container: tgi"
    export CURRENT_LLM_SERVING_ENGINE=tgi
    return
  fi

  if which trtllm-build >/dev/null; then
    echo "Container: tensorrt-llm"
    export CURRENT_LLM_SERVING_ENGINE=trt
    return
  fi

  if [ -e /sgl-workspace ]; then
    echo "Container: sglang"
    export CURRENT_LLM_SERVING_ENGINE=sglang
    return
  fi

  if [ -e /vllm-workspace ]; then
    echo "Container: vllm"
    # move to a completely irrelevant directory, to avoid import vllm from current folder
    export CURRENT_LLM_SERVING_ENGINE=vllm

    return
  fi
}

json2args() {
  # transforms the JSON string to command line args, and '_' is replaced to '-'
  # example:
  # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
  # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
  local json_string=$1
  local args=$(
    echo "$json_string" | jq -r '
      to_entries |
      map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
      join(" ")
    '
  )
  echo "$args"
}
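# Illustrative note (not part of the original script): a hypothetical example of
# how json2args is typically invoked, assuming jq is installed and the JSON has
# the shape shown in the comment above.
#   params='{ "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }'
#   args=$(json2args "$params")
#   echo "$args"   # --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1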

kill_gpu_processes() {
  pkill -f '[p]ython'
  pkill -f '[p]ython3'
  pkill -f '[t]ritonserver'
  pkill -f '[p]t_main_thread'
  pkill -f '[t]ext-generation'
  pkill -f '[l]mdeploy'
  # vLLM now names the process with VLLM prefix after https://github.com/vllm-project/vllm/pull/21445
  pkill -f '[V]LLM'

  while [ "$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits | head -n 1)" -ge 1000 ]; do
    sleep 1
  done
}

wait_for_server() {
  # wait for vllm server to start
  # return 1 if vllm server crashes
  timeout 1200 bash -c '
    until curl -s localhost:8000/v1/completions > /dev/null; do
      sleep 1
    done' && return 0 || return 1
}

ensure_installed() {
  # Ensure that the given command is installed by apt-get
  local cmd=$1
  if ! which "$cmd" >/dev/null; then
    apt-get update && apt-get install -y "$cmd"
  fi
}

run_serving_tests() {
  # run serving tests using `vllm bench serve` command
  # $1: a json file specifying serving test cases

  local serving_test_file
  serving_test_file=$1

  # Iterate over serving tests
  jq -c '.[]' "$serving_test_file" | while read -r params; do
    # get the test name, and append the GPU type back to it.
    test_name=$(echo "$params" | jq -r '.test_name')

    # if TEST_SELECTOR is set, only run the test cases that match the selector
    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
      echo "Skip test case $test_name."
      continue
    fi

    # prepend the current serving engine to the test name
    test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name}

    # get common parameters
    common_params=$(echo "$params" | jq -r '.common_parameters')
    model=$(echo "$common_params" | jq -r '.model')
    tp=$(echo "$common_params" | jq -r '.tp')
    dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
    dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
    port=$(echo "$common_params" | jq -r '.port')
    num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
    reuse_server=$(echo "$common_params" | jq -r '.reuse_server')

    # get client and server arguments
    server_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_server_parameters")
    client_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_client_parameters")
    client_args=$(json2args "$client_params")
    qps_list=$(echo "$params" | jq -r '.qps_list')
    qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
    echo "Running over qps list $qps_list"

    # check if there is enough GPU to run the test
    if [[ $gpu_count -lt $tp ]]; then
      echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name."
      continue
    fi

    if [[ $reuse_server == "true" ]]; then
      echo "Reuse previous server for test case $test_name"
    else
      kill_gpu_processes
      bash "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/launch-server.sh" \
        "$server_params" "$common_params"
    fi

    if wait_for_server; then
      echo ""
      echo "$CURRENT_LLM_SERVING_ENGINE server is up and running."
    else
      echo ""
      echo "$CURRENT_LLM_SERVING_ENGINE failed to start within the timeout period."
      break
    fi

    # prepare tokenizer
    # this is required for lmdeploy.
    cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
    rm -rf /tokenizer_cache
    mkdir /tokenizer_cache
    python3 ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \
      --model "$model" \
      --cachedir /tokenizer_cache
    cd "$VLLM_SOURCE_CODE_LOC/benchmarks"

    # change model name for lmdeploy (it will not follow standard hf name)
    if [[ "$CURRENT_LLM_SERVING_ENGINE" == "lmdeploy" ]]; then
      model=$(python ../.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py)
    fi

    # iterate over different QPS
    for qps in $qps_list; do
      # remove the surrounding single quote from qps
      if [[ "$qps" == *"inf"* ]]; then
        echo "qps was $qps"
        qps="inf"
        echo "now qps is $qps"
      fi

      new_test_name=$test_name"_qps_"$qps

      backend=$CURRENT_LLM_SERVING_ENGINE

      if [[ $backend = "trt" ]]; then
        backend="tensorrt-llm"
      fi

      if [[ "$backend" == *"vllm"* ]]; then
        backend="vllm"
      fi

      if [[ "$dataset_name" = "sharegpt" ]]; then

        client_command="vllm bench serve \
          --backend $backend \
          --tokenizer /tokenizer_cache \
          --model $model \
          --dataset-name $dataset_name \
          --dataset-path $dataset_path \
          --num-prompts $num_prompts \
          --port $port \
          --save-result \
          --result-dir $RESULTS_FOLDER \
          --result-filename ${new_test_name}.json \
          --request-rate $qps \
          --ignore-eos \
          $client_args"

      elif [[ "$dataset_name" = "sonnet" ]]; then

        sonnet_input_len=$(echo "$common_params" | jq -r '.sonnet_input_len')
        sonnet_output_len=$(echo "$common_params" | jq -r '.sonnet_output_len')
        sonnet_prefix_len=$(echo "$common_params" | jq -r '.sonnet_prefix_len')

        client_command="vllm bench serve \
          --backend $backend \
          --tokenizer /tokenizer_cache \
          --model $model \
          --dataset-name $dataset_name \
          --dataset-path $dataset_path \
          --num-prompts $num_prompts \
          --sonnet-input-len $sonnet_input_len \
          --sonnet-output-len $sonnet_output_len \
          --sonnet-prefix-len $sonnet_prefix_len \
          --port $port \
          --save-result \
          --result-dir $RESULTS_FOLDER \
          --result-filename ${new_test_name}.json \
          --request-rate $qps \
          --ignore-eos \
          $client_args"

      else

        echo "The dataset name must be either 'sharegpt' or 'sonnet'. Got $dataset_name."
        exit 1

      fi

      echo "Running test case $test_name with qps $qps"
      echo "Client command: $client_command"

      eval "$client_command"

      server_command="None"

      # record the benchmarking commands
      jq_output=$(jq -n \
        --arg server "$server_command" \
        --arg client "$client_command" \
        --arg gpu "$gpu_type" \
        --arg engine "$CURRENT_LLM_SERVING_ENGINE" \
        '{
          server_command: $server,
          client_command: $client,
          gpu_type: $gpu,
          engine: $engine
        }')
      echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"

    done

  done

  kill_gpu_processes
}

run_genai_perf_tests() {
  # run genai-perf tests

  # $1: a json file specifying genai-perf test cases
  local genai_perf_test_file
  genai_perf_test_file=$1

  # Iterate over genai-perf tests
  jq -c '.[]' "$genai_perf_test_file" | while read -r params; do
    # get the test name, and append the GPU type back to it.
    test_name=$(echo "$params" | jq -r '.test_name')

    # if TEST_SELECTOR is set, only run the test cases that match the selector
    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
      echo "Skip test case $test_name."
      continue
    fi

    # prepend the current serving engine to the test name
    test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name}

    # get common parameters
    common_params=$(echo "$params" | jq -r '.common_parameters')
    model=$(echo "$common_params" | jq -r '.model')
    tp=$(echo "$common_params" | jq -r '.tp')
    dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
    dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
    port=$(echo "$common_params" | jq -r '.port')
    num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
    reuse_server=$(echo "$common_params" | jq -r '.reuse_server')

    # get client and server arguments
    server_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_server_parameters")
    qps_list=$(echo "$params" | jq -r '.qps_list')
    qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
    echo "Running over qps list $qps_list"

    # check if there is enough GPU to run the test
    if [[ $gpu_count -lt $tp ]]; then
      echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name."
      continue
    fi

    if [[ $reuse_server == "true" ]]; then
      echo "Reuse previous server for test case $test_name"
    else
      kill_gpu_processes
      bash "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/launch-server.sh" \
        "$server_params" "$common_params"
    fi

    if wait_for_server; then
      echo ""
      echo "$CURRENT_LLM_SERVING_ENGINE server is up and running."
    else
      echo ""
      echo "$CURRENT_LLM_SERVING_ENGINE failed to start within the timeout period."
      break
    fi

    # iterate over different QPS
    for qps in $qps_list; do
      # remove the surrounding single quote from qps
      if [[ "$qps" == *"inf"* ]]; then
        echo "qps was $qps"
        qps=$num_prompts
        echo "now qps is $qps"
      fi

      new_test_name=$test_name"_qps_"$qps
      backend=$CURRENT_LLM_SERVING_ENGINE

      if [[ "$backend" == *"vllm"* ]]; then
        backend="vllm"
      fi
      #TODO: add output dir.
      client_command="genai-perf profile \
        -m $model \
        --service-kind openai \
        --backend "$backend" \
        --endpoint-type chat \
        --streaming \
        --url localhost:$port \
        --request-rate $qps \
        --num-prompts $num_prompts \
        "

      echo "Client command: $client_command"

      eval "$client_command"

      #TODO: process/record outputs
    done
  done

  kill_gpu_processes

}

prepare_dataset() {

  # download sharegpt dataset
  cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
  wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json

  # duplicate sonnet by 4x, to allow benchmarking with input length 2048
  cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
  echo "" > sonnet_4x.txt
  for _ in {1..4}
  do
    cat sonnet.txt >> sonnet_4x.txt
  done

}

main() {

  # check if the environment variable is successfully injected from yaml

  check_gpus
  check_hf_token
  get_current_llm_serving_engine

  pip install -U transformers

  pip install -r requirements/dev.txt
  which genai-perf

  # check storage
  df -h

  ensure_installed wget
  ensure_installed curl
  ensure_installed jq
  # genai-perf dependency
  ensure_installed libb64-0d

  prepare_dataset

  cd "$VLLM_SOURCE_CODE_LOC/benchmarks"
  declare -g RESULTS_FOLDER=results/
  mkdir -p $RESULTS_FOLDER
  BENCHMARK_ROOT="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/"

  # run the test
  run_serving_tests "$BENCHMARK_ROOT/tests/nightly-tests.json"

  # run genai-perf tests
  run_genai_perf_tests "$BENCHMARK_ROOT/tests/genai-perf-tests.json"
  mv artifacts/ $RESULTS_FOLDER/

  # upload benchmark results to buildkite
  python3 -m pip install tabulate pandas
  python3 "$BENCHMARK_ROOT/scripts/summary-nightly-results.py"
  upload_to_buildkite

}

main "$@"
@ -1,82 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import datetime
import json
import os
from pathlib import Path

import pandas as pd
from tabulate import tabulate

results_folder = Path("results/")

# serving results and the keys that will be printed into markdown
serving_results = []
serving_column_mapping = {
    "test_name": "Test name",
    "gpu_type": "GPU",
    "completed": "Successful req.",
    "request_throughput": "Tput (req/s)",
    "mean_ttft_ms": "Mean TTFT (ms)",
    "std_ttft_ms": "Std TTFT (ms)",
    "median_ttft_ms": "Median TTFT (ms)",
    "mean_itl_ms": "Mean ITL (ms)",
    "std_itl_ms": "Std ITL (ms)",
    "median_itl_ms": "Median ITL (ms)",
    "mean_tpot_ms": "Mean TPOT (ms)",
    "std_tpot_ms": "Std TPOT (ms)",
    "median_tpot_ms": "Median TPOT (ms)",
    "total_token_throughput": "Total Token Tput (tok/s)",
    "output_throughput": "Output Tput (tok/s)",
    "total_input_tokens": "Total input tokens",
    "total_output_tokens": "Total output tokens",
    "engine": "Engine",
}

if __name__ == "__main__":
    # collect results
    for test_file in results_folder.glob("*.json"):
        with open(test_file) as f:
            raw_result = json.loads(f.read())

        # attach the benchmarking command to raw_result
        with open(test_file.with_suffix(".commands")) as f:
            command = json.loads(f.read())
        raw_result.update(command)

        # update the test name of this result
        raw_result.update({"test_name": test_file.stem})

        # add the result to raw_result
        serving_results.append(raw_result)
        continue

    serving_results = pd.DataFrame.from_dict(serving_results)

    if not serving_results.empty:
        serving_results = serving_results[list(serving_column_mapping.keys())].rename(
            columns=serving_column_mapping
        )

    serving_md_table_with_headers = tabulate(
        serving_results, headers="keys", tablefmt="pipe", showindex=False
    )
    # remove the first line of header
    serving_md_table_lines = serving_md_table_with_headers.split("\n")
    serving_md_table_without_header = "\n".join(serving_md_table_lines[2:])

    prefix = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    prefix = prefix + "_" + os.environ.get("CURRENT_LLM_SERVING_ENGINE")

    # document benchmarking results in markdown
    with open(results_folder / f"{prefix}_nightly_results.md", "w") as f:
        # document results with header.
        # for those who wants to reproduce our benchmark.
        f.write(serving_md_table_with_headers)
        f.write("\n")

    # document benchmarking results in json
    with open(results_folder / f"{prefix}_nightly_results.json", "w") as f:
        results = serving_results.to_dict(orient="records")
        f.write(json.dumps(results))
@ -1,23 +0,0 @@
#!/bin/sh
TOKEN=$(curl -s -L "https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-postmerge-repo:pull" | jq -r .token)
if [[ "$BUILDKITE_BRANCH" == "main" ]]; then
  URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-postmerge-repo/manifests/$BUILDKITE_COMMIT"
else
  URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-test-repo/manifests/$BUILDKITE_COMMIT"
fi

TIMEOUT_SECONDS=10

retries=0
while [ $retries -lt 1000 ]; do
  if [ "$(curl -s --max-time "$TIMEOUT_SECONDS" -L -H "Authorization: Bearer $TOKEN" -o /dev/null -w "%{http_code}" "$URL")" -eq 200 ]; then
    exit 0
  fi

  echo "Waiting for image to be available..."

  retries=$((retries + 1))
  sleep 5
done

exit 1
@ -1,30 +0,0 @@
[
    {
        "test_name": "latency_llama8B_tp1",
        "environment_variables": {
            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
            "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 1,
            "load_format": "dummy",
            "num_iters_warmup": 5,
            "num_iters": 15
        }
    },
    {
        "test_name": "latency_llama8B_tp4",
        "environment_variables": {
            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
            "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 4,
            "load_format": "dummy",
            "num_iters_warmup": 5,
            "num_iters": 15
        }
    }
]
@ -1,32 +0,0 @@
[
    {
        "test_name": "throughput_llama8B_tp1",
        "environment_variables": {
            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
            "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 1,
            "load_format": "dummy",
            "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 200,
            "backend": "vllm"
        }
    },
    {
        "test_name": "throughput_llama8B_tp4",
        "environment_variables": {
            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
            "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 4,
            "load_format": "dummy",
            "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
            "num_prompts": 200,
            "backend": "vllm"
        }
    }
]
@ -2,40 +2,23 @@
 ## Introduction
 
-This directory contains two sets of benchmark for vllm.
+This directory contains a benchmarking suite for **developers** to run locally and gain clarity on whether their PR improves/degrades vllm's performance.
+vLLM also maintains a continuous performance benchmark under [perf.vllm.ai](https://perf.vllm.ai/), hosted under PyTorch CI HUD.
-
-- Performance benchmark: benchmark vllm's performance under various workload, for **developers** to gain clarity on whether their PR improves/degrades vllm's performance
-- Nightly benchmark: compare vllm's performance against alternatives (tgi, trt-llm and lmdeploy), for **the public** to know when to choose vllm.
-
-See [vLLM performance dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm) for the latest performance benchmark results and [vLLM GitHub README](https://github.com/vllm-project/vllm/blob/main/README.md) for latest nightly benchmark results.
 
 ## Performance benchmark quick overview
 
-**Benchmarking Coverage**: latency, throughput and fix-qps serving on A100 (the support for FP8 benchmark on H100 is coming!) and Intel® Xeon® Processors, with different models.
+**Benchmarking Coverage**: latency, throughput and fix-qps serving on B200, A100, H100, Intel® Xeon® Processors and Intel® Gaudi® 3 Accelerators with different models.
 
 **Benchmarking Duration**: about 1hr.
 
 **For benchmarking developers**: please try your best to constraint the duration of benchmarking to about 1 hr so that it won't take forever to run.
 
-## Nightly benchmark quick overview
-
-**Benchmarking Coverage**: Fix-qps serving on A100 (the support for FP8 benchmark on H100 is coming!) on Llama-3 8B, 70B and Mixtral 8x7B.
-
-**Benchmarking engines**: vllm, TGI, trt-llm and lmdeploy.
-
-**Benchmarking Duration**: about 3.5hrs.
-
 ## Trigger the benchmark
 
-Performance benchmark will be triggered when:
+The benchmark needs to be triggered manually:
 
-- A PR being merged into vllm.
-- Every commit for those PRs with `perf-benchmarks` label AND `ready` label.
-
-Manually Trigger the benchmark
-
 ```bash
-bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+bash .buildkite/performance-benchmarks/scripts/run-performance-benchmarks.sh
 ```
 
 Runtime environment variables:
@ -47,14 +30,11 @@ Runtime environment variables:
 - `REMOTE_HOST`: IP for the remote vLLM service to benchmark. Default value is empty string.
 - `REMOTE_PORT`: Port for the remote vLLM service to benchmark. Default value is empty string.
 
-Nightly benchmark will be triggered when:
-
-- Every commit for those PRs with `perf-benchmarks` label and `nightly-benchmarks` label.
-
 ## Performance benchmark details
 
 See [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases.
 > NOTE: For Intel® Xeon® Processors, use `tests/latency-tests-cpu.json`, `tests/throughput-tests-cpu.json`, `tests/serving-tests-cpu.json` instead.
+> For Intel® Gaudi® 3 Accelerators, use `tests/latency-tests-hpu.json`, `tests/throughput-tests-hpu.json`, `tests/serving-tests-hpu.json` instead.
 >
 ### Latency test
 
@ -152,26 +132,3 @@ Here is an example using the script to compare result_a and result_b with Model,
 A comparison diagram will be generated below the table.
 Here is an example to compare between 96c/results_gnr_96c_091_tp2pp3 and 128c/results_gnr_128c_091_tp2pp3
 <img width="1886" height="828" alt="image" src="https://github.com/user-attachments/assets/c02a43ef-25d0-4fd6-90e5-2169a28682dd" />
-
-## Nightly test details
-
-See [nightly-descriptions.md](nightly-descriptions.md) for the detailed description on test workload, models and docker containers of benchmarking other llm engines.
-
-### Workflow
-
-- The [nightly-pipeline.yaml](nightly-pipeline.yaml) specifies the docker containers for different LLM serving engines.
-- Inside each container, we run [scripts/run-nightly-benchmarks.sh](scripts/run-nightly-benchmarks.sh), which will probe the serving engine of the current container.
-- The `scripts/run-nightly-benchmarks.sh` will parse the workload described in [nightly-tests.json](tests/nightly-tests.json) and launch the right benchmark for the specified serving engine via `scripts/launch-server.sh`.
-- At last, we run [scripts/summary-nightly-results.py](scripts/summary-nightly-results.py) to collect and plot the final benchmarking results, and update the results to buildkite.
-
-### Nightly tests
-
-In [nightly-tests.json](tests/nightly-tests.json), we include the command line arguments for benchmarking commands, together with the benchmarking test cases. The format is highly similar to performance benchmark.
-
-### Docker containers
-
-The docker containers for benchmarking are specified in `nightly-pipeline.yaml`.
-
-WARNING: the docker versions are HARD-CODED and SHOULD BE ALIGNED WITH `nightly-descriptions.md`. The docker versions need to be hard-coded as there are several version-specific bug fixes inside `scripts/run-nightly-benchmarks.sh` and `scripts/launch-server.sh`.
-
-WARNING: populating `trt-llm` to latest version is not easy, as it requires updating several protobuf files in [tensorrt-demo](https://github.com/neuralmagic/tensorrt-demo.git).
@ -5,7 +5,7 @@
 - Input length: 32 tokens.
 - Output length: 128 tokens.
 - Batch size: fixed (8).
-- GPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
+- GPU/HPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
 - CPU Models: llama-3.1 8B.
 - Evaluation metrics: end-to-end latency (mean, median, p99).
 
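As a rough illustration (not taken from the repository's test configs), a latency case of this shape could be reproduced manually with `vllm bench latency`; the exact flag spellings below are assumptions derived from the JSON fields used in the latency test files elsewhere in this diff:

```bash
# Hypothetical standalone run mirroring the latency test description above.
vllm bench latency \
  --model meta-llama/Llama-3.1-8B-Instruct \
  --input-len 32 \
  --output-len 128 \
  --batch-size 8 \
  --num-iters-warmup 5 \
  --num-iters 15
```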
@ -16,7 +16,7 @@
 - Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed).
 - Output length: the corresponding output length of these 200 prompts.
 - Batch size: dynamically determined by vllm to achieve maximum throughput.
-- GPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
+- GPU/HPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
 - CPU Models: llama-3.1 8B.
 - Evaluation metrics: throughput.
 
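A similarly hedged sketch for the throughput case, using the ShareGPT file and prompt count from the CPU throughput configs shown earlier in this diff (flag names are assumptions derived from those JSON keys as converted by the suite's json2args helper):

```bash
# Hypothetical standalone throughput run over 200 ShareGPT prompts.
vllm bench throughput \
  --model meta-llama/Llama-3.1-8B-Instruct \
  --backend vllm \
  --dataset ./ShareGPT_V3_unfiltered_cleaned_split.json \
  --num-prompts 200
```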
@ -28,7 +28,7 @@
 - Output length: the corresponding output length of these 200 prompts.
 - Batch size: dynamically determined by vllm and the arrival pattern of the requests.
 - **Average QPS (query per second)**: 1, 4, 16 and inf. QPS = inf means all requests come at once. For other QPS values, the arrival time of each query is determined using a random Poisson process (with fixed random seed).
-- GPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
+- GPU/HPU Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
 - We also added a speculative decoding test for llama-3 70B on GPU, under QPS 2
 - CPU Models: llama-3.1 8B.
 - Evaluation metrics: throughput, TTFT (time to the first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99).
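For the fixed-QPS serving case, the same idea can be sketched with `vllm bench serve`, which is the client command the scripts in this diff build; the flags below appear verbatim in those scripts, but the concrete values here are illustrative only:

```bash
# Hypothetical fixed-QPS client run against an already-running server on port 8000.
vllm bench serve \
  --backend vllm \
  --model meta-llama/Llama-3.1-8B-Instruct \
  --dataset-name sharegpt \
  --dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json \
  --num-prompts 200 \
  --request-rate 4 \
  --port 8000 \
  --ignore-eos
```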
@ -7,6 +7,7 @@ from importlib import util
 
 import pandas as pd
 
+pd.options.display.float_format = "{:.2f}".format
 plotly_found = util.find_spec("plotly.express") is not None
 
 
@ -109,7 +110,10 @@ def compare_data_columns(
         if len(compare_frames) >= 2:
             base = compare_frames[0]
             current = compare_frames[-1]
-            ratio = current / base
+            if "P99" in data_column or "Median" in data_column:
+                ratio = base / current  # for latency
+            else:
+                ratio = current / base
             ratio = ratio.mask(base == 0)  # avoid inf when baseline is 0
             ratio.name = f"Ratio 1 vs {len(compare_frames)}"
             frames.append(ratio)
@ -199,6 +203,71 @@ def split_json_by_tp_pp(
     return saved_paths
 
 
+def _add_limit_line(fig, y_value, label):
+    # Visible dashed line + annotation
+    fig.add_hline(
+        y=y_value,
+        line_dash="dash",
+        line_color="red" if "ttft" in label.lower() else "blue",
+        annotation_text=f"{label}: {y_value} ms",
+        annotation_position="top left",
+    )
+    # Optional: add a legend item (as a transparent helper trace)
+    if plot and plotly_found:
+        import plotly.graph_objects as go
+
+        fig.add_trace(
+            go.Scatter(
+                x=[None],
+                y=[None],
+                mode="lines",
+                line=dict(
+                    dash="dash", color="red" if "ttft" in label.lower() else "blue"
+                ),
+                name=f"{label}",
+            )
+        )
+
+
+def _find_concurrency_col(df: pd.DataFrame) -> str:
+    for c in [
+        "# of max concurrency.",
+        "# of max concurrency",
+        "Max Concurrency",
+        "max_concurrency",
+        "Concurrency",
+    ]:
+        if c in df.columns:
+            return c
+    # Fallback: guess an integer-like column (harmless if unused)
+    for c in df.columns:
+        if df[c].dtype.kind in "iu" and df[c].nunique() > 1 and df[c].min() >= 1:
+            return c
+    return "# of max concurrency."
+
+
+def _highlight_threshold(
+    df: pd.DataFrame, threshold: float
+) -> "pd.io.formats.style.Styler":
+    """Highlight numeric per-configuration columns with value <= threshold."""
+    conc_col = _find_concurrency_col(df)
+    key_cols = [
+        c
+        for c in ["Model", "Dataset Name", "Input Len", "Output Len", conc_col]
+        if c in df.columns
+    ]
+    conf_cols = [
+        c for c in df.columns if c not in key_cols and not str(c).startswith("Ratio")
+    ]
+    conf_cols = [c for c in conf_cols if pd.api.types.is_numeric_dtype(df[c])]
+    return df.style.map(
+        lambda v: "background-color:#e6ffe6;font-weight:bold;"
+        if pd.notna(v) and v <= threshold
+        else "",
+        subset=conf_cols,
+    )
+
+
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument(
@ -220,6 +289,26 @@ if __name__ == "__main__":
         default="# of max concurrency.",
         help="column name to use as X Axis in comparison graph",
     )
+    parser.add_argument(
+        "-l",
+        "--latency",
+        type=str,
+        default="p99",
+        help="take median|p99 for latency like TTFT/TPOT",
+    )
+    parser.add_argument(
+        "--ttft-max-ms",
+        type=float,
+        default=3000.0,
+        help="Reference limit for TTFT plots (ms)",
+    )
+    parser.add_argument(
+        "--tpot-max-ms",
+        type=float,
+        default=100.0,
+        help="Reference limit for TPOT plots (ms)",
+    )
 
     args = parser.parse_args()
 
     drop_column = "P99"
@ -234,12 +323,22 @@ if __name__ == "__main__":
         "# of max concurrency.",
         "qps",
     ]
-    data_cols_to_compare = ["Output Tput (tok/s)", "Median TTFT (ms)", "Median"]
-    html_msgs_for_data_cols = [
-        "Compare Output Tokens /n",
-        "Median TTFT /n",
-        "Median TPOT /n",
-    ]
+    if "median" in args.latency:
+        data_cols_to_compare = ["Output Tput (tok/s)", "Median TTFT (ms)", "Median"]
+        html_msgs_for_data_cols = [
+            "Compare Output Tokens /n",
+            "Median TTFT /n",
+            "Median TPOT /n",
+        ]
+        drop_column = "P99"
+    elif "p99" in args.latency:
+        data_cols_to_compare = ["Output Tput (tok/s)", "P99 TTFT (ms)", "P99"]
+        html_msgs_for_data_cols = [
+            "Compare Output Tokens /n",
+            "P99 TTFT /n",
+            "P99 TPOT /n",
+        ]
 
     if len(args.file) == 1:
         files = split_json_by_tp_pp(args.file[0], output_root="splits")
@ -275,33 +374,83 @@ if __name__ == "__main__":
                 f"Expected subset: {filtered_info_cols}, "
                 f"but DataFrame has: {list(output_df.columns)}"
             )
-        output_df_sorted = output_df.sort_values(by=existing_group_cols)
+        # output_df_sorted = output_df.sort_values(by=existing_group_cols)
+        output_df_sorted = output_df.sort_values(by=args.xaxis)
         output_groups = output_df_sorted.groupby(existing_group_cols, dropna=False)
         for name, group in output_groups:
-            html = group.to_html()
+            group_name = (
+                ",".join(map(str, name)).replace(",", "_").replace("/", "-")
+            )
+            group_html_name = "perf_comparison_" + group_name + ".html"
+
+            metric_name = str(data_cols_to_compare[i]).lower()
+            if "tok/s" in metric_name:
+                html = group.to_html()
+            elif "ttft" in metric_name:
+                styler = _highlight_threshold(group, args.ttft_max_ms).format(
+                    {c: "{:.2f}" for c in group.select_dtypes("number").columns},
+                    na_rep="—",
+                )
+                html = styler.to_html(
+                    table_attributes='border="1" class="dataframe"'
+                )
+            elif (
+                "tpot" in metric_name
+                or "median" in metric_name
+                or "p99" in metric_name
+            ):
+                styler = _highlight_threshold(group, args.tpot_max_ms).format(
+                    {c: "{:.2f}" for c in group.select_dtypes("number").columns},
+                    na_rep="—",
+                )
+                html = styler.to_html(
+                    table_attributes='border="1" class="dataframe"'
+                )
 
             text_file.write(html_msgs_for_data_cols[i])
             text_file.write(html)
+            with open(group_html_name, "a+") as sub_text_file:
+                sub_text_file.write(html_msgs_for_data_cols[i])
+                sub_text_file.write(html)
 
-            if plot and plotly_found:
-                import plotly.express as px
+                if plot and plotly_found:
+                    import plotly.express as px
 
-                df = group[raw_data_cols]
-                df_sorted = df.sort_values(by=info_cols[y_axis_index])
-                # Melt DataFrame for plotting
-                df_melted = df_sorted.melt(
-                    id_vars=info_cols[y_axis_index],
-                    var_name="Configuration",
-                    value_name=data_cols_to_compare[i],
-                )
-                title = data_cols_to_compare[i] + " vs " + info_cols[y_axis_index]
-                # Create Plotly line chart
-                fig = px.line(
-                    df_melted,
-                    x=info_cols[y_axis_index],
-                    y=data_cols_to_compare[i],
-                    color="Configuration",
-                    title=title,
-                    markers=True,
-                )
-                # Export to HTML
-                text_file.write(fig.to_html(full_html=True, include_plotlyjs="cdn"))
+                    df = group[raw_data_cols]
+                    df_sorted = df.sort_values(by=info_cols[y_axis_index])
+                    # Melt DataFrame for plotting
+                    df_melted = df_sorted.melt(
+                        id_vars=info_cols[y_axis_index],
+                        var_name="Configuration",
+                        value_name=data_cols_to_compare[i],
+                    )
+                    title = (
+                        data_cols_to_compare[i] + " vs " + info_cols[y_axis_index]
+                    )
+                    # Create Plotly line chart
+                    fig = px.line(
+                        df_melted,
+                        x=info_cols[y_axis_index],
+                        y=data_cols_to_compare[i],
+                        color="Configuration",
+                        title=title,
+                        markers=True,
+                    )
+
+                    # ---- Add threshold lines based on metric name ----
+                    if "ttft" in metric_name:
+                        _add_limit_line(fig, args.ttft_max_ms, "TTFT limit")
+                    elif (
+                        "tpot" in metric_name
+                        or "median" in metric_name
+                        or "p99" in metric_name
+                    ):
+                        _add_limit_line(fig, args.tpot_max_ms, "TPOT limit")
+
+                    # Export to HTML
+                    text_file.write(
+                        fig.to_html(full_html=True, include_plotlyjs="cdn")
+                    )
+                    sub_text_file.write(
+                        fig.to_html(full_html=True, include_plotlyjs="cdn")
+                    )
@ -63,9 +63,11 @@ serving_column_mapping = {
     "mean_ttft_ms": "Mean TTFT (ms)",
     "median_ttft_ms": "Median TTFT (ms)",
     "p99_ttft_ms": "P99 TTFT (ms)",
+    "std_ttft_ms": "STD TTFT (ms)",
     "mean_tpot_ms": "Mean TPOT (ms)",
     "median_tpot_ms": "Median",
     "p99_tpot_ms": "P99",
+    "std_tpot_ms": "STD TPOT (ms)",
     "mean_itl_ms": "Mean ITL (ms)",
     "median_itl_ms": "Median ITL (ms)",
     "p99_itl_ms": "P99 ITL (ms)",
@ -368,7 +370,7 @@ if __name__ == "__main__":
     # The GPUs sometimes come in format of "GPUTYPE\nGPUTYPE\n...",
     # we want to turn it into "8xGPUTYPE"
     df["GPU"] = df["GPU"].apply(
-        lambda x: f"{len(x.splitlines())}x{x.splitlines()[0]}"
+        lambda x: "{}x{}".format(len(x.split("\n")), x.split("\n")[0])
     )
 
     # get markdown tables
@ -390,7 +392,7 @@ if __name__ == "__main__":
     json_file = "benchmark_results.json"
     with open(results_folder / md_file, "w") as f:
         results = read_markdown(
-            "../.buildkite/nightly-benchmarks/"
+            "../.buildkite/performance-benchmarks/"
             + "performance-benchmarks-descriptions.md"
         )
         results = results.format(
@ -15,6 +15,8 @@ check_gpus() {
     declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
   elif command -v amd-smi; then
     declare -g gpu_count=$(amd-smi list | grep 'GPU' | wc -l)
+  elif command -v hl-smi; then
+    declare -g gpu_count=$(hl-smi --list | grep -i "Module ID" | wc -l)
   fi
 
   if [[ $gpu_count -gt 0 ]]; then
@ -23,10 +25,16 @@ check_gpus() {
     echo "Need at least 1 GPU to run benchmarking."
     exit 1
   fi
 
+  declare -g arch_suffix=''
+
   if command -v nvidia-smi; then
     declare -g gpu_type=$(nvidia-smi --query-gpu=name --format=csv,noheader | awk '{print $2}')
   elif command -v amd-smi; then
     declare -g gpu_type=$(amd-smi static -g 0 -a | grep 'MARKET_NAME' | awk '{print $2}')
+  elif command -v hl-smi; then
+    declare -g gpu_type=$(hl-smi -q | grep "Product Name" | head -n 1 | awk -F ':' '{print $2}' | sed 's/^ *//')
+    arch_suffix='-hpu'
   fi
   echo "GPU type is $gpu_type"
 }
@ -138,6 +146,10 @@ kill_gpu_processes() {
     while [ "$(amd-smi metric -g 0 | grep 'USED_VRAM' | awk '{print $2}')" -ge 1000 ]; do
       sleep 1
     done
+  elif command -v hl-smi; then
+    while [ "$(hl-smi -q | grep "Used" | head -n 1 | awk '{print $3}')" -ge 1000 ]; do
+      sleep 1
+    done
   fi
 
   # remove vllm config file
@ -451,14 +463,10 @@ main() {
     ARCH='-cpu'
   else
     check_gpus
+    ARCH="$arch_suffix"
   fi
   check_hf_token
 
-  # Set to v1 to run v1 benchmark
-  if [[ "${ENGINE_VERSION:-v0}" == "v1" ]]; then
-    export VLLM_USE_V1=1
-  fi
-
   # dependencies
   (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
   (which jq) || (apt-get update && apt-get -y install jq)
@ -474,7 +482,12 @@ main() {
   ensure_sharegpt_downloaded
   declare -g RESULTS_FOLDER=results/
   mkdir -p $RESULTS_FOLDER
-  QUICK_BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
+  QUICK_BENCHMARK_ROOT=../.buildkite/performance-benchmarks/
+
+  # dump vllm info via vllm collect-env
+  env_output=$(vllm collect-env)
+
+  echo "$env_output" >"$RESULTS_FOLDER/vllm_env.txt"
 
   # benchmarking
   run_serving_tests $QUICK_BENCHMARK_ROOT/tests/"${SERVING_JSON:-serving-tests$ARCH.json}"
@ -0,0 +1,26 @@
[
    {
        "test_name": "latency_llama8B_tp2",
        "environment_variables": {
            "VLLM_RPC_TIMEOUT": 100000,
            "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
            "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
            "VLLM_CPU_SGL_KERNEL": 1,
            "VLLM_CPU_KVCACHE_SPACE": 40
        },
        "parameters": {
            "model": "meta-llama/Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 2,
            "dtype": "bfloat16",
            "distributed_executor_backend": "mp",
            "block_size": 128,
            "trust_remote_code": "",
            "disable_log_stats": "",
            "enforce_eager": "",
            "max_num_batched_tokens": 2048,
            "max_num_seqs": 256,
            "num_iters_warmup": 5,
            "num_iters": 15
        }
    }
]
@ -0,0 +1,55 @@
[
    {
        "test_name": "latency_llama8B_tp1",
        "environment_variables": {
            "PT_HPU_LAZY_MODE": 1,
            "VLLM_CONTIGUOUS_PA": 1,
            "VLLM_DEFRAG": 1
        },
        "parameters": {
            "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
            "tensor_parallel_size": 1,
            "load_format": "dummy",
            "num-iters-warmup": 5,
            "num-iters": 15,
            "max-model-len": 256,
            "async-scheduling": ""
        }
    },
    {
        "test_name": "latency_llama70B_tp4",
        "environment_variables": {
            "PT_HPU_LAZY_MODE": 1,
            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
            "VLLM_CONTIGUOUS_PA": 1,
            "VLLM_DEFRAG": 1
        },
        "parameters": {
            "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
            "tensor_parallel_size": 4,
            "load_format": "dummy",
            "num-iters-warmup": 5,
            "num-iters": 15,
            "max-model-len": 256,
            "async-scheduling": ""
        }
    },
    {
        "test_name": "latency_mixtral8x7B_tp2",
        "environment_variables": {
            "PT_HPU_LAZY_MODE": 1,
            "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
            "VLLM_CONTIGUOUS_PA": 1,
            "VLLM_DEFRAG": 1
        },
        "parameters": {
            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
            "tensor_parallel_size": 2,
            "load_format": "dummy",
            "num-iters-warmup": 5,
            "num-iters": 15,
            "max-model-len": 256,
            "async-scheduling": ""
        }
    }
]
@ -95,6 +95,38 @@
|
|||||||
"num_prompts": 200
|
"num_prompts": 200
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_llama8B_bf16_tp4_sharegpt",
|
||||||
|
"qps_list": ["inf"],
|
||||||
|
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
||||||
|
"server_environment_variables": {
|
||||||
|
"VLLM_RPC_TIMEOUT": 100000,
|
||||||
|
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||||
|
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||||
|
"VLLM_CPU_SGL_KERNEL": 1,
|
||||||
|
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||||
|
},
|
||||||
|
"server_parameters": {
|
||||||
|
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||||
|
"tensor_parallel_size": 4,
|
||||||
|
"dtype": "bfloat16",
|
||||||
|
"distributed_executor_backend": "mp",
|
||||||
|
"block_size": 128,
|
||||||
|
"trust_remote_code": "",
|
||||||
|
"disable_log_stats": "",
|
||||||
|
"enforce_eager": "",
|
||||||
|
"max_num_batched_tokens": 2048,
|
||||||
|
"max_num_seqs": 256,
|
||||||
|
"load_format": "dummy"
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||||
|
"backend": "vllm",
|
||||||
|
"dataset_name": "sharegpt",
|
||||||
|
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||||
|
"num_prompts": 200
|
||||||
|
}
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"test_name": "serving_llama8B_bf16_tp2pp3_sharegpt",
|
"test_name": "serving_llama8B_bf16_tp2pp3_sharegpt",
|
||||||
"qps_list": ["inf"],
|
"qps_list": ["inf"],
|
||||||
@ -233,6 +265,41 @@
|
|||||||
"num_prompts": 1000
|
"num_prompts": 1000
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_llama8B_bf16_tp4_random_128_128",
|
||||||
|
"qps_list": ["inf"],
|
||||||
|
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
|
||||||
|
"server_environment_variables": {
|
||||||
|
"VLLM_RPC_TIMEOUT": 100000,
|
||||||
|
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||||
|
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||||
|
"VLLM_CPU_SGL_KERNEL": 1,
|
||||||
|
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||||
|
},
|
||||||
|
"server_parameters": {
|
||||||
|
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||||
|
"tensor_parallel_size": 4,
|
||||||
|
"dtype": "bfloat16",
|
||||||
|
"distributed_executor_backend": "mp",
|
||||||
|
"block_size": 128,
|
||||||
|
"trust_remote_code": "",
|
||||||
|
"enable_chunked_prefill": "",
|
||||||
|
"disable_log_stats": "",
|
||||||
|
"enforce_eager": "",
|
||||||
|
"max_num_batched_tokens": 2048,
|
||||||
|
"max_num_seqs": 256,
|
||||||
|
"load_format": "dummy"
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"model": "meta-llama/Llama-3.1-8B-Instruct",
|
||||||
|
"backend": "vllm",
|
||||||
|
"dataset_name": "random",
|
||||||
|
"random-input-len": 128,
|
||||||
|
"random-output-len": 128,
|
||||||
|
"ignore-eos": "",
|
||||||
|
"num_prompts": 1000
|
||||||
|
}
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"test_name": "serving_llama8B_bf16_tp2pp3_random_128_128",
|
"test_name": "serving_llama8B_bf16_tp2pp3_random_128_128",
|
||||||
"qps_list": ["inf"],
|
"qps_list": ["inf"],
|
||||||
@ -365,6 +432,38 @@
|
|||||||
"num_prompts": 200
|
"num_prompts": 200
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"test_name": "serving_llama8B_int8_tp4_sharegpt",
|
||||||
|
"qps_list": ["inf"],
|
||||||
|
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
|
||||||
|
"server_environment_variables": {
|
||||||
|
"VLLM_RPC_TIMEOUT": 100000,
|
||||||
|
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
|
||||||
|
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
|
||||||
|
"VLLM_CPU_SGL_KERNEL": 1,
|
||||||
|
"VLLM_CPU_KVCACHE_SPACE": 40
|
||||||
|
},
|
||||||
|
"server_parameters": {
|
||||||
|
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
||||||
|
"tensor_parallel_size": 4,
|
||||||
|
"dtype": "bfloat16",
|
||||||
|
"distributed_executor_backend": "mp",
|
||||||
|
"block_size": 128,
|
||||||
|
"trust_remote_code": "",
|
||||||
|
"disable_log_stats": "",
|
||||||
|
"enforce_eager": "",
|
||||||
|
"max_num_batched_tokens": 2048,
|
||||||
|
"max_num_seqs": 256,
|
||||||
|
"load_format": "dummy"
|
||||||
|
},
|
||||||
|
"client_parameters": {
|
||||||
|
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
|
||||||
|
"backend": "vllm",
|
||||||
|
"dataset_name": "sharegpt",
|
||||||
|
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
|
||||||
|
"num_prompts": 200
|
||||||
|
}
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"test_name": "serving_llama8B_int8_tp2pp3_sharegpt",
|
"test_name": "serving_llama8B_int8_tp2pp3_sharegpt",
|
||||||
"qps_list": ["inf"],
|
"qps_list": ["inf"],
|
||||||
@ -503,6 +602,41 @@
|
|||||||
"num_prompts": 1000
|
"num_prompts": 1000
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
{
|
+      "test_name": "serving_llama8B_int8_tp4_random_128_128",
+      "qps_list": ["inf"],
+      "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+      "server_environment_variables": {
+        "VLLM_RPC_TIMEOUT": 100000,
+        "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+        "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+        "VLLM_CPU_SGL_KERNEL": 1,
+        "VLLM_CPU_KVCACHE_SPACE": 40
+      },
+      "server_parameters": {
+        "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+        "tensor_parallel_size": 4,
+        "dtype": "bfloat16",
+        "distributed_executor_backend": "mp",
+        "block_size": 128,
+        "trust_remote_code": "",
+        "enable_chunked_prefill": "",
+        "disable_log_stats": "",
+        "enforce_eager": "",
+        "max_num_batched_tokens": 2048,
+        "max_num_seqs": 256,
+        "load_format": "dummy"
+      },
+      "client_parameters": {
+        "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
+        "backend": "vllm",
+        "dataset_name": "random",
+        "random-input-len": 128,
+        "random-output-len": 128,
+        "ignore-eos": "",
+        "num_prompts": 1000
+      }
+    },
     {
       "test_name": "serving_llama8B_int8_tp2pp3_random_128_128",
       "qps_list": ["inf"],
@@ -638,6 +772,39 @@
       "num_prompts": 200
     }
   },
+  {
+    "test_name": "serving_llama8B_int4_tp4_sharegpt",
+    "qps_list": ["inf"],
+    "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+    "server_environment_variables": {
+      "VLLM_RPC_TIMEOUT": 100000,
+      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+      "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+      "VLLM_CPU_SGL_KERNEL": 1,
+      "VLLM_CPU_KVCACHE_SPACE": 40
+    },
+    "server_parameters": {
+      "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+      "quantization": "awq",
+      "tensor_parallel_size": 4,
+      "dtype": "bfloat16",
+      "distributed_executor_backend": "mp",
+      "block_size": 128,
+      "trust_remote_code": "",
+      "disable_log_stats": "",
+      "enforce_eager": "",
+      "max_num_batched_tokens": 2048,
+      "max_num_seqs": 256,
+      "load_format": "dummy"
+    },
+    "client_parameters": {
+      "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+      "backend": "vllm",
+      "dataset_name": "sharegpt",
+      "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+      "num_prompts": 200
+    }
+  },
   {
     "test_name": "serving_llama8B_int4_tp2pp3_sharegpt",
     "qps_list": ["inf"],
@@ -780,6 +947,42 @@
       "num_prompts": 1000
     }
   },
+  {
+    "test_name": "serving_llama8B_int4_tp4_random_128_128",
+    "qps_list": ["inf"],
+    "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
+    "server_environment_variables": {
+      "VLLM_RPC_TIMEOUT": 100000,
+      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+      "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+      "VLLM_CPU_SGL_KERNEL": 1,
+      "VLLM_CPU_KVCACHE_SPACE": 40
+    },
+    "server_parameters": {
+      "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+      "quantization": "awq",
+      "tensor_parallel_size": 4,
+      "dtype": "bfloat16",
+      "distributed_executor_backend": "mp",
+      "block_size": 128,
+      "trust_remote_code": "",
+      "enable_chunked_prefill": "",
+      "disable_log_stats": "",
+      "enforce_eager": "",
+      "max_num_batched_tokens": 2048,
+      "max_num_seqs": 256,
+      "load_format": "dummy"
+    },
+    "client_parameters": {
+      "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
+      "backend": "vllm",
+      "dataset_name": "random",
+      "random-input-len": 128,
+      "random-output-len": 128,
+      "ignore-eos": "",
+      "num_prompts": 1000
+    }
+  },
   {
     "test_name": "serving_llama8B_int4_tp2pp3_random_128_128",
     "qps_list": ["inf"],
@@ -2,7 +2,7 @@
   {
     "test_name": "serving_llama8B_tp1_sharegpt",
     "qps_list": [1, 4, 16, "inf"],
-    "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+    "max_concurrency_list": [32],
     "server_environment_variables": {
       "VLLM_RPC_TIMEOUT": 100000,
       "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
@@ -28,13 +28,13 @@
       "backend": "vllm",
       "dataset_name": "sharegpt",
       "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-      "num_prompts": 200
+      "num_prompts": 32
     }
   },
   {
     "test_name": "serving_llama8B_tp2_sharegpt",
     "qps_list": [1, 4, 16, "inf"],
-    "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+    "max_concurrency_list": [32],
     "server_environment_variables": {
       "VLLM_RPC_TIMEOUT": 100000,
       "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
@@ -60,13 +60,13 @@
       "backend": "vllm",
       "dataset_name": "sharegpt",
       "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-      "num_prompts": 200
+      "num_prompts": 32
     }
   },
   {
-    "test_name": "serving_llama8B_tp4_sharegpt",
+    "test_name": "serving_llama8B_tp1_random_128_128",
     "qps_list": [1, 4, 16, "inf"],
-    "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+    "max_concurrency_list": [32],
     "server_environment_variables": {
       "VLLM_RPC_TIMEOUT": 100000,
       "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
@@ -76,39 +76,7 @@
     },
     "server_parameters": {
       "model": "meta-llama/Llama-3.1-8B-Instruct",
-      "tensor_parallel_size": 4,
+      "tensor_parallel_size": 1,
-      "dtype": "bfloat16",
-      "distributed_executor_backend": "mp",
-      "block_size": 128,
-      "trust_remote_code": "",
-      "disable_log_stats": "",
-      "enforce_eager": "",
-      "max_num_batched_tokens": 2048,
-      "max_num_seqs": 256,
-      "load_format": "dummy"
-    },
-    "client_parameters": {
-      "model": "meta-llama/Llama-3.1-8B-Instruct",
-      "backend": "vllm",
-      "dataset_name": "sharegpt",
-      "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-      "num_prompts": 200
-    }
-  },
-  {
-    "test_name": "serving_llama8B_tp4_random_1024_128",
-    "qps_list": [1, 4, 16, "inf"],
-    "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
-    "server_environment_variables": {
-      "VLLM_RPC_TIMEOUT": 100000,
-      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
-      "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
-      "VLLM_CPU_SGL_KERNEL": 1,
-      "VLLM_CPU_KVCACHE_SPACE": 40
-    },
-    "server_parameters": {
-      "model": "meta-llama/Llama-3.1-8B-Instruct",
-      "tensor_parallel_size": 4,
       "dtype": "bfloat16",
       "distributed_executor_backend": "mp",
       "block_size": 128,
@@ -124,16 +92,16 @@
       "model": "meta-llama/Llama-3.1-8B-Instruct",
       "backend": "vllm",
       "dataset_name": "random",
-      "random-input-len": 1024,
+      "random-input-len": 128,
       "random-output-len": 128,
       "ignore-eos": "",
-      "num_prompts": 100
+      "num_prompts": 32
     }
   },
   {
-    "test_name": "serving_llama8B_pp6_random_1024_128",
+    "test_name": "serving_llama8B_tp2_random_128_128",
     "qps_list": [1, 4, 16, "inf"],
-    "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
+    "max_concurrency_list": [32],
     "server_environment_variables": {
       "VLLM_RPC_TIMEOUT": 100000,
       "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
@@ -143,7 +111,7 @@
     },
     "server_parameters": {
       "model": "meta-llama/Llama-3.1-8B-Instruct",
-      "pipeline_parallel_size": 6,
+      "tensor_parallel_size": 2,
       "dtype": "bfloat16",
       "distributed_executor_backend": "mp",
       "block_size": 128,
@@ -159,10 +127,150 @@
       "model": "meta-llama/Llama-3.1-8B-Instruct",
       "backend": "vllm",
       "dataset_name": "random",
-      "random-input-len": 1024,
+      "random-input-len": 128,
       "random-output-len": 128,
       "ignore-eos": "",
-      "num_prompts": 100
+      "num_prompts": 32
+    }
+  },
+  {
+    "test_name": "serving_llama8B_tp1_random_128_2048",
+    "qps_list": [1, 4, 16, "inf"],
+    "max_concurrency_list": [32],
+    "server_environment_variables": {
+      "VLLM_RPC_TIMEOUT": 100000,
+      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+      "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+      "VLLM_CPU_SGL_KERNEL": 1,
+      "VLLM_CPU_KVCACHE_SPACE": 40
+    },
+    "server_parameters": {
+      "model": "meta-llama/Llama-3.1-8B-Instruct",
+      "tensor_parallel_size": 1,
+      "dtype": "bfloat16",
+      "distributed_executor_backend": "mp",
+      "block_size": 128,
+      "trust_remote_code": "",
+      "enable_chunked_prefill": "",
+      "disable_log_stats": "",
+      "enforce_eager": "",
+      "max_num_batched_tokens": 2048,
+      "max_num_seqs": 256,
+      "load_format": "dummy"
+    },
+    "client_parameters": {
+      "model": "meta-llama/Llama-3.1-8B-Instruct",
+      "backend": "vllm",
+      "dataset_name": "random",
+      "random-input-len": 128,
+      "random-output-len": 2048,
+      "ignore-eos": "",
+      "num_prompts": 32
+    }
+  },
+  {
+    "test_name": "serving_llama8B_tp2_random_128_2048",
+    "qps_list": [1, 4, 16, "inf"],
+    "max_concurrency_list": [32],
+    "server_environment_variables": {
+      "VLLM_RPC_TIMEOUT": 100000,
+      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+      "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+      "VLLM_CPU_SGL_KERNEL": 1,
+      "VLLM_CPU_KVCACHE_SPACE": 40
+    },
+    "server_parameters": {
+      "model": "meta-llama/Llama-3.1-8B-Instruct",
+      "tensor_parallel_size": 2,
+      "dtype": "bfloat16",
+      "distributed_executor_backend": "mp",
+      "block_size": 128,
+      "trust_remote_code": "",
+      "enable_chunked_prefill": "",
+      "disable_log_stats": "",
+      "enforce_eager": "",
+      "max_num_batched_tokens": 2048,
+      "max_num_seqs": 256,
+      "load_format": "dummy"
+    },
+    "client_parameters": {
+      "model": "meta-llama/Llama-3.1-8B-Instruct",
+      "backend": "vllm",
+      "dataset_name": "random",
+      "random-input-len": 128,
+      "random-output-len": 2048,
+      "ignore-eos": "",
+      "num_prompts": 32
+    }
+  },
+  {
+    "test_name": "serving_llama8B_tp1_random_2048_128",
+    "qps_list": [1, 4, 16, "inf"],
+    "max_concurrency_list": [32],
+    "server_environment_variables": {
+      "VLLM_RPC_TIMEOUT": 100000,
+      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+      "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+      "VLLM_CPU_SGL_KERNEL": 1,
+      "VLLM_CPU_KVCACHE_SPACE": 40
+    },
+    "server_parameters": {
+      "model": "meta-llama/Llama-3.1-8B-Instruct",
+      "tensor_parallel_size": 1,
+      "dtype": "bfloat16",
+      "distributed_executor_backend": "mp",
+      "block_size": 128,
+      "trust_remote_code": "",
+      "enable_chunked_prefill": "",
+      "disable_log_stats": "",
+      "enforce_eager": "",
+      "max_num_batched_tokens": 2048,
+      "max_num_seqs": 256,
+      "load_format": "dummy"
+    },
+    "client_parameters": {
+      "model": "meta-llama/Llama-3.1-8B-Instruct",
+      "backend": "vllm",
+      "dataset_name": "random",
+      "random-input-len": 2048,
+      "random-output-len": 128,
+      "ignore-eos": "",
+      "num_prompts": 32
+    }
+  },
+  {
+    "test_name": "serving_llama8B_tp2_random_2048_128",
+    "qps_list": [1, 4, 16, "inf"],
+    "max_concurrency_list": [32],
+    "server_environment_variables": {
+      "VLLM_RPC_TIMEOUT": 100000,
+      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+      "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+      "VLLM_CPU_SGL_KERNEL": 1,
+      "VLLM_CPU_KVCACHE_SPACE": 40
+    },
+    "server_parameters": {
+      "model": "meta-llama/Llama-3.1-8B-Instruct",
+      "tensor_parallel_size": 2,
+      "dtype": "bfloat16",
+      "distributed_executor_backend": "mp",
+      "block_size": 128,
+      "trust_remote_code": "",
+      "enable_chunked_prefill": "",
+      "disable_log_stats": "",
+      "enforce_eager": "",
+      "max_num_batched_tokens": 2048,
+      "max_num_seqs": 256,
+      "load_format": "dummy"
+    },
+    "client_parameters": {
+      "model": "meta-llama/Llama-3.1-8B-Instruct",
+      "backend": "vllm",
+      "dataset_name": "random",
+      "random-input-len": 2048,
+      "random-output-len": 128,
+      "ignore-eos": "",
+      "num_prompts": 32
     }
   }
 ]
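For orientation (this is a reading of the configs above, not part of the diff): each `server_parameters` map is flag-style, so the harness is assumed to export the `server_environment_variables` and expand every key into the matching `vllm serve` option, with underscores becoming dashes and empty-string values becoming bare boolean flags. Under that assumption, the new serving_llama8B_int8_tp4_random_128_128 entry would launch the server roughly as:

# Hedged sketch of the assumed expansion; not the harness itself.
export VLLM_RPC_TIMEOUT=100000
export VLLM_ALLOW_LONG_MAX_MODEL_LEN=1
export VLLM_ENGINE_ITERATION_TIMEOUT_S=120
export VLLM_CPU_SGL_KERNEL=1
export VLLM_CPU_KVCACHE_SPACE=40   # CPU KV-cache space in GiB

vllm serve RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8 \
  --tensor-parallel-size 4 \
  --dtype bfloat16 \
  --distributed-executor-backend mp \
  --block-size 128 \
  --trust-remote-code \
  --enable-chunked-prefill \
  --disable-log-stats \
  --enforce-eager \
  --max-num-batched-tokens 2048 \
  --max-num-seqs 256 \
  --load-format dummy   # dummy weights: measures scheduling/serving, not model quality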
@@ -0,0 +1,82 @@
+[
+  {
+    "test_name": "serving_llama8B_tp1_sharegpt",
+    "qps_list": [1, 4, 16, "inf"],
+    "server_environment_variables": {
+      "PT_HPU_LAZY_MODE": 1,
+      "VLLM_CONTIGUOUS_PA": 1,
+      "VLLM_DEFRAG": 1
+    },
+    "server_parameters": {
+      "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+      "tensor_parallel_size": 1,
+      "swap_space": 16,
+      "disable_log_stats": "",
+      "load_format": "dummy",
+      "max-model-len": 2048,
+      "max-num-seqs": 256,
+      "async-scheduling": ""
+    },
+    "client_parameters": {
+      "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+      "backend": "vllm",
+      "dataset_name": "sharegpt",
+      "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+      "num_prompts": 200
+    }
+  },
+  {
+    "test_name": "serving_llama70B_tp4_sharegpt",
+    "qps_list": [1, 4, 16, "inf"],
+    "server_environment_variables": {
+      "PT_HPU_LAZY_MODE": 1,
+      "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
+      "VLLM_CONTIGUOUS_PA": 1,
+      "VLLM_DEFRAG": 1
+    },
+    "server_parameters": {
+      "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
+      "tensor_parallel_size": 4,
+      "swap_space": 16,
+      "disable_log_stats": "",
+      "load_format": "dummy",
+      "max-model-len": 2048,
+      "max-num-seqs": 256,
+      "async-scheduling": ""
+    },
+    "client_parameters": {
+      "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
+      "backend": "vllm",
+      "dataset_name": "sharegpt",
+      "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+      "num_prompts": 200
+    }
+  },
+  {
+    "test_name": "serving_mixtral8x7B_tp2_sharegpt",
+    "qps_list": [1, 4, 16, "inf"],
+    "server_environment_variables": {
+      "PT_HPU_LAZY_MODE": 1,
+      "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
+      "VLLM_CONTIGUOUS_PA": 1,
+      "VLLM_DEFRAG": 1
+    },
+    "server_parameters": {
+      "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
+      "tensor_parallel_size": 2,
+      "swap_space": 16,
+      "disable_log_stats": "",
+      "load_format": "dummy",
+      "max-model-len": 2048,
+      "max-num-seqs": 256,
+      "async-scheduling": ""
+    },
+    "client_parameters": {
+      "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
+      "backend": "vllm",
+      "dataset_name": "sharegpt",
+      "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+      "num_prompts": 200
+    }
+  }
+]
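The `client_parameters` side works the same way: those keys are handed to the serving benchmark client as flags. Assuming the standard `vllm bench serve` entry point is used (the older benchmarks/benchmark_serving.py accepts the same options), the new HPU ShareGPT test above corresponds roughly to:

# Hedged sketch of the assumed client invocation for serving_llama8B_tp1_sharegpt.
vllm bench serve \
  --backend vllm \
  --model meta-llama/Meta-Llama-3.1-8B-Instruct \
  --dataset-name sharegpt \
  --dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json \
  --num-prompts 200
# qps_list and max_concurrency_list (where present) are swept by the harness,
# presumably via --request-rate and --max-concurrency, one run per combination.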
@@ -0,0 +1,27 @@
+[
+  {
+    "test_name": "throughput_llama8B_tp2",
+    "environment_variables": {
+      "VLLM_RPC_TIMEOUT": 100000,
+      "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
+      "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
+      "VLLM_CPU_SGL_KERNEL": 1,
+      "VLLM_CPU_KVCACHE_SPACE": 40
+    },
+    "parameters": {
+      "model": "meta-llama/Llama-3.1-8B-Instruct",
+      "tensor_parallel_size": 2,
+      "dtype": "bfloat16",
+      "distributed_executor_backend": "mp",
+      "block_size": 128,
+      "trust_remote_code": "",
+      "disable_log_stats": "",
+      "enforce_eager": "",
+      "max_num_batched_tokens": 2048,
+      "max_num_seqs": 256,
+      "dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+      "num_prompts": 200,
+      "backend": "vllm"
+    }
+  }
+]
@@ -0,0 +1,61 @@
+[
+  {
+    "test_name": "throughput_llama8B_tp1",
+    "environment_variables": {
+      "PT_HPU_LAZY_MODE": 1,
+      "VLLM_CONTIGUOUS_PA": 1,
+      "VLLM_DEFRAG": 1
+    },
+    "parameters": {
+      "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+      "tensor_parallel_size": 1,
+      "load_format": "dummy",
+      "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+      "num_prompts": 1000,
+      "backend": "vllm",
+      "max-model-len": 2048,
+      "max-num-seqs": 512,
+      "async-scheduling": ""
+    }
+  },
+  {
+    "test_name": "throughput_llama70B_tp4",
+    "environment_variables": {
+      "PT_HPU_LAZY_MODE": 1,
+      "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
+      "VLLM_CONTIGUOUS_PA": 1,
+      "VLLM_DEFRAG": 1
+    },
+    "parameters": {
+      "model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
+      "tensor_parallel_size": 4,
+      "load_format": "dummy",
+      "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+      "num_prompts": 1000,
+      "backend": "vllm",
+      "max-model-len": 2048,
+      "max-num-seqs": 512,
+      "async-scheduling": ""
+    }
+  },
+  {
+    "test_name": "throughput_mixtral8x7B_tp2",
+    "environment_variables": {
+      "PT_HPU_LAZY_MODE": 1,
+      "PT_HPU_ENABLE_LAZY_COLLECTIVES": 1,
+      "VLLM_CONTIGUOUS_PA": 1,
+      "VLLM_DEFRAG": 1
+    },
+    "parameters": {
+      "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
+      "tensor_parallel_size": 2,
+      "load_format": "dummy",
+      "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+      "num_prompts": 1000,
+      "backend": "vllm",
+      "max-model-len": 2048,
+      "max-num-seqs": 512,
+      "async-scheduling": ""
+    }
+  }
+]
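The throughput entries keep everything in a single `parameters` map because server and client run in one process. Assuming those keys likewise expand one-to-one into `vllm bench throughput` options (this mapping, and the exact flag names, are an assumption), the first HPU entry is approximately:

# Hedged sketch of the assumed single-process throughput run for throughput_llama8B_tp1.
PT_HPU_LAZY_MODE=1 VLLM_CONTIGUOUS_PA=1 VLLM_DEFRAG=1 \
vllm bench throughput \
  --backend vllm \
  --model meta-llama/Meta-Llama-3.1-8B-Instruct \
  --tensor-parallel-size 1 \
  --load-format dummy \
  --dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json \
  --num-prompts 1000 \
  --max-model-len 2048 \
  --max-num-seqs 512 \
  --async-scheduling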
@@ -1,5 +1,5 @@
 steps:
-  # aarch64 + CUDA builds. PyTorch 2.8 aarch64 + CUDA wheel is only available on CUDA 12.9
+  # aarch64 + CUDA builds
   - label: "Build arm64 wheel - CUDA 12.9"
     depends_on: ~
     id: build-wheel-arm64-cuda-12-9
@@ -8,13 +8,28 @@ steps:
     commands:
       # #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
      # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg VLLM_MAIN_CUDA_VERSION=12.9 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg VLLM_MAIN_CUDA_VERSION=12.9 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
       - "mkdir artifacts"
       - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
       - "bash .buildkite/scripts/upload-wheels.sh"
     env:
       DOCKER_BUILDKIT: "1"

+  # aarch64 build
+  - label: "Build arm64 CPU wheel"
+    depends_on: ~
+    id: build-wheel-arm64-cpu
+    agents:
+      queue: arm64_cpu_queue_postmerge
+    commands:
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_BUILD_ACL=ON --tag vllm-ci:build-image --target vllm-build --progress plain -f docker/Dockerfile.cpu ."
+      - "mkdir artifacts"
+      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
+      - "bash .buildkite/scripts/upload-wheels.sh"
+    env:
+      DOCKER_BUILDKIT: "1"
+
+  # x86 + CUDA builds
   - label: "Build wheel - CUDA 12.8"
     depends_on: ~
     id: build-wheel-cuda-12-8
@@ -28,33 +43,33 @@ steps:
     env:
       DOCKER_BUILDKIT: "1"

-  - label: "Build wheel - CUDA 12.6"
-    depends_on: ~
-    id: build-wheel-cuda-12-6
-    agents:
-      queue: cpu_queue_postmerge
-    commands:
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.6.3 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
-      - "mkdir artifacts"
-      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-      - "bash .buildkite/scripts/upload-wheels.sh"
-    env:
-      DOCKER_BUILDKIT: "1"
-
-  # x86 + CUDA builds
   - label: "Build wheel - CUDA 12.9"
     depends_on: ~
     id: build-wheel-cuda-12-9
     agents:
       queue: cpu_queue_postmerge
     commands:
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
       - "mkdir artifacts"
       - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
       - "bash .buildkite/scripts/upload-wheels.sh"
     env:
       DOCKER_BUILDKIT: "1"

+  - label: "Build wheel - CUDA 13.0"
+    depends_on: ~
+    id: build-wheel-cuda-13-0
+    agents:
+      queue: cpu_queue_postmerge
+    commands:
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+      - "mkdir artifacts"
+      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
+      - "bash .buildkite/scripts/upload-wheels.sh"
+    env:
+      DOCKER_BUILDKIT: "1"
+
+  # Build release images (12.9)
   - label: "Build release image (x86)"
     depends_on: ~
     id: build-release-image-x86
@@ -62,13 +77,12 @@ steps:
       queue: cpu_queue_postmerge
     commands:
       - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
       - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
       # re-tag to default image tag and push, just in case arm64 build fails
       - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
       - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"

-  # PyTorch 2.8 aarch64 + CUDA wheel is only available on CUDA 12.9
   - label: "Build release image (arm64)"
     depends_on: ~
     id: build-release-image-arm64
@@ -76,7 +90,7 @@ steps:
       queue: arm64_cpu_queue_postmerge
     commands:
       - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
       - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"

   # Add job to create multi-arch manifest
@@ -102,24 +116,6 @@ steps:
     commands:
       - "bash .buildkite/scripts/annotate-release.sh"

-  - label: "Build and publish TPU release image"
-    depends_on: ~
-    if: build.env("NIGHTLY") == "1"
-    agents:
-      queue: tpu_queue_postmerge
-    commands:
-      - "yes | docker system prune -a"
-      - "git fetch --all"
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --tag vllm/vllm-tpu:nightly --tag vllm/vllm-tpu:$BUILDKITE_COMMIT --progress plain -f docker/Dockerfile.tpu ."
-      - "docker push vllm/vllm-tpu:nightly"
-      - "docker push vllm/vllm-tpu:$BUILDKITE_COMMIT"
-    plugins:
-      - docker-login#v3.0.0:
-          username: vllmbot
-          password-env: DOCKERHUB_TOKEN
-    env:
-      DOCKER_BUILDKIT: "1"
-
   - input: "Provide Release version here"
     id: input-release-version
     fields:
@@ -142,6 +138,22 @@ steps:
     env:
       DOCKER_BUILDKIT: "1"

+  - block: "Build arm64 CPU release image"
+    key: block-arm64-cpu-release-image-build
+    depends_on: ~
+
+  - label: "Build and publish arm64 CPU release image"
+    depends_on: block-arm64-cpu-release-image-build
+    agents:
+      queue: arm64_cpu_queue_postmerge
+    commands:
+      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
+      - "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest"
+      - "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
+    env:
+      DOCKER_BUILDKIT: "1"
+
   - label: "Build and publish nightly multi-arch image to DockerHub"
     depends_on:
       - create-multi-arch-manifest
@@ -2,16 +2,23 @@

 set -ex

-# Get release version and strip leading 'v' if present
-RELEASE_VERSION=$(buildkite-agent meta-data get release-version | sed 's/^v//')
+# Get release version, default to 1.0.0.dev for nightly/per-commit builds
+RELEASE_VERSION=$(buildkite-agent meta-data get release-version 2>/dev/null | sed 's/^v//')

-if [ -z "$RELEASE_VERSION" ]; then
-  echo "Error: RELEASE_VERSION is empty. 'release-version' metadata might not be set or is invalid."
-  exit 1
+if [ -z "${RELEASE_VERSION}" ]; then
+  RELEASE_VERSION="1.0.0.dev"
 fi

 buildkite-agent annotate --style 'info' --context 'release-workflow' << EOF
-To download the wheel:
+To download the wheel (by commit):
+\`\`\`
+aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
+aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux2014_aarch64.whl .
+
+aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl .
+aws s3 cp s3://vllm-wheels/${BUILDKITE_COMMIT}/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl .
+\`\`\`
+
+To download the wheel (by version):
 \`\`\`
 aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
 aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux2014_aarch64.whl .
@@ -173,6 +173,14 @@ fi
 PARALLEL_JOB_COUNT=8
 MYPYTHONPATH=".."

+# Test that we're launching on the machine that has
+# proper access to GPUs
+render_gid=$(getent group render | cut -d: -f3)
+if [[ -z "$render_gid" ]]; then
+    echo "Error: 'render' group not found. This is required for GPU access." >&2
+    exit 1
+fi
+
 # check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs.
 if [[ $commands == *"--shard-id="* ]]; then
   # assign job count as the number of shards used
@@ -186,6 +194,7 @@ if [[ $commands == *"--shard-id="* ]]; then
       --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
       --network=host \
       --shm-size=16gb \
+      --group-add "$render_gid" \
       --rm \
       -e HIP_VISIBLE_DEVICES="${GPU}" \
       -e HF_TOKEN \
@@ -217,8 +226,8 @@ else
       --device /dev/kfd $BUILDKITE_AGENT_META_DATA_RENDER_DEVICES \
       --network=host \
       --shm-size=16gb \
+      --group-add "$render_gid" \
       --rm \
-      -e HIP_VISIBLE_DEVICES=0 \
       -e HF_TOKEN \
       -e AWS_ACCESS_KEY_ID \
       -e AWS_SECRET_ACCESS_KEY \
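The render_gid lookup added above exists because, on ROCm hosts, the render nodes under /dev/dri (and /dev/kfd) are owned by the render group, so the test container must join that group to reach the GPUs. A quick manual check of the same precondition, assuming a ROCm host and any ROCm-capable image, looks like this:

# Resolve the host's render group id, exactly as the CI script does.
render_gid=$(getent group render | cut -d: -f3)
echo "render gid: ${render_gid:-<missing>}"

# Confirm a container joined to that group can see the GPU device nodes.
docker run --rm \
  --device /dev/kfd --device /dev/dri \
  --group-add "$render_gid" \
  rocm/dev-ubuntu-22.04 \
  bash -c 'id; ls -l /dev/kfd /dev/dri'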
@@ -25,25 +25,28 @@ function cpu_tests() {

   # offline inference
   podman exec -it "$container_id" bash -c "
-    set -e
-    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
+    set -xve
+    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" >> $HOME/test_basic.log

   # Run basic model test
   podman exec -it "$container_id" bash -c "
-    set -e
+    set -evx
     pip install pytest pytest-asyncio einops peft Pillow soundfile transformers_stream_generator matplotlib
     pip install sentence-transformers datamodel_code_generator
-    pytest -v -s tests/models/language/generation/test_bart.py -m cpu_model
+    # Note: disable Bart until supports V1
+    # pytest -v -s tests/models/language/generation/test_bart.py -m cpu_model
     pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-openai-community/gpt2]
     pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-facebook/opt-125m]
     pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-google/gemma-1.1-2b-it]
     pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach]
-    pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model"
+    # TODO: Below test case tests/models/language/pooling/test_embedding.py::test_models[True-ssmits/Qwen2-7B-Instruct-embed-base] fails on ppc64le. Disabling it for time being.
+    # pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model" >> $HOME/test_rest.log
 }

 # All of CPU tests are expected to be finished less than 40 mins.

 export container_id
 export -f cpu_tests
-timeout 40m bash -c cpu_tests
+timeout 120m bash -c cpu_tests
@@ -70,7 +70,7 @@ function cpu_tests() {
   docker exec cpu-test-"$NUMA_NODE" bash -c "
     set -e
     pytest -x -s -v \
-      tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs[False-10-32-neuralmagic/Llama-3.2-1B-quantized.w8a8]"
+      tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs"

   # Note: disable it until supports V1
   # Run AWQ test
@@ -64,10 +64,9 @@ python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git
     && python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
     && python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
 echo "--- Python dependencies installed ---"
-export VLLM_USE_V1=1
 export VLLM_XLA_CHECK_RECOMPILATION=1
 export VLLM_XLA_CACHE_PATH=
-echo "Using VLLM V1"

 echo "--- Hardware Information ---"
 # tpu-info
@@ -64,10 +64,9 @@ python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git
     && python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
     && python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
 echo "--- Python dependencies installed ---"
-export VLLM_USE_V1=1
 export VLLM_XLA_CHECK_RECOMPILATION=1
 export VLLM_XLA_CACHE_PATH=
-echo "Using VLLM V1"

 echo "--- Hardware Information ---"
 # tpu-info
@@ -20,7 +20,10 @@ trap remove_docker_container EXIT

 # Run the image and test offline inference/tensor parallel
 docker run \
-    --device /dev/dri \
+    --device /dev/dri:/dev/dri \
+    --net=host \
+    --ipc=host \
+    --privileged \
     -v /dev/dri/by-path:/dev/dri/by-path \
     --entrypoint="" \
     -e "HF_TOKEN=${HF_TOKEN}" \
@@ -42,8 +45,7 @@ docker run \
   pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py
   pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
   pytest -v -s v1/structured_output
-  pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py
+  pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py --ignore=v1/spec_decode/test_speculators_eagle3.py
   pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py
-  pytest -v -s v1/test_metrics
   pytest -v -s v1/test_serial_utils.py
 '
@@ -0,0 +1,62 @@
+#!/usr/bin/env bash
+set -euxo pipefail
+
+# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
+THRESHOLD=${1:-0.25}
+NUM_Q=${2:-1319}
+PORT=${3:-8010}
+OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
+mkdir -p "${OUT_DIR}"
+
+wait_for_server() {
+  local port=$1
+  timeout 600 bash -c '
+    until curl -sf "http://127.0.0.1:'"$port"'/health" > /dev/null; do
+      sleep 1
+    done'
+}
+
+MODEL="deepseek-ai/DeepSeek-V2-lite"
+BACKENDS=("deepep_high_throughput" "deepep_low_latency")
+
+cleanup() {
+  if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
+    kill "${SERVER_PID}" 2>/dev/null || true
+    for _ in {1..20}; do
+      kill -0 "${SERVER_PID}" 2>/dev/null || break
+      sleep 0.5
+    done
+    kill -9 "${SERVER_PID}" 2>/dev/null || true
+  fi
+}
+trap cleanup EXIT
+
+for BACK in "${BACKENDS[@]}"; do
+  VLLM_DEEP_GEMM_WARMUP=skip \
+  VLLM_ALL2ALL_BACKEND=$BACK \
+  vllm serve "$MODEL" \
+    --enforce-eager \
+    --tensor-parallel-size 2 \
+    --data-parallel-size 2 \
+    --enable-expert-parallel \
+    --enable-eplb \
+    --trust-remote-code \
+    --max-model-len 2048 \
+    --port $PORT &
+  SERVER_PID=$!
+  wait_for_server $PORT
+
+  TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
+  OUT="${OUT_DIR}/${TAG}_${BACK}.json"
+  python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
+  python3 - <<PY
+import json; acc=json.load(open('${OUT}'))['accuracy']
+print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")
+assert acc >= ${THRESHOLD}, f"${MODEL} ${BACK} accuracy {acc}"
+PY
+
+  cleanup
+  SERVER_PID=
+  sleep 1
+  PORT=$((PORT+1))
+done
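The script takes three optional positional arguments (accuracy threshold, number of GSM8K questions, starting port) and honours an OUT_DIR override from the environment, so a smaller, stricter smoke run would be invoked like this (the script path is a placeholder; use wherever the file lands in the tree):

# Hypothetical invocation: threshold 0.30, 200 questions, servers starting at port 8100.
OUT_DIR=/tmp/deepep-eval ./run_gsm8k_deepep.sh 0.30 200 8100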
@@ -0,0 +1,61 @@
+#!/usr/bin/env bash
+set -euxo pipefail
+
+# args: [THRESHOLD] [NUM_QUESTIONS] [START_PORT]
+THRESHOLD=${1:-0.8}
+NUM_Q=${2:-1319}
+PORT=${3:-8020}
+OUT_DIR=${OUT_DIR:-/tmp/vllm-scheduled}
+mkdir -p "${OUT_DIR}"
+
+wait_for_server() {
+  local port=$1
+  timeout 600 bash -c '
+    until curl -sf "http://127.0.0.1:'"$port"'/health" > /dev/null; do
+      sleep 1
+    done'
+}
+
+MODEL="QWen/Qwen3-30B-A3B-FP8"
+BACKENDS=("deepep_high_throughput" "deepep_low_latency")
+
+cleanup() {
+  if [[ -n "${SERVER_PID:-}" ]] && kill -0 "${SERVER_PID}" 2>/dev/null; then
+    kill "${SERVER_PID}" 2>/dev/null || true
+    for _ in {1..20}; do
+      kill -0 "${SERVER_PID}" 2>/dev/null || break
+      sleep 0.5
+    done
+    kill -9 "${SERVER_PID}" 2>/dev/null || true
+  fi
+}
+trap cleanup EXIT
+
+for BACK in "${BACKENDS[@]}"; do
+  VLLM_DEEP_GEMM_WARMUP=skip \
+  VLLM_ALL2ALL_BACKEND=$BACK \
+  vllm serve "$MODEL" \
+    --enforce-eager \
+    --tensor-parallel-size 2 \
+    --data-parallel-size 2 \
+    --enable-expert-parallel \
+    --trust-remote-code \
+    --max-model-len 2048 \
+    --port $PORT &
+  SERVER_PID=$!
+  wait_for_server $PORT
+
+  TAG=$(echo "$MODEL" | tr '/: \\n' '_____')
+  OUT="${OUT_DIR}/${TAG}_${BACK}.json"
+  python3 tests/evals/gsm8k/gsm8k_eval.py --host http://127.0.0.1 --port $PORT --num-questions ${NUM_Q} --save-results ${OUT}
+  python3 - <<PY
+import json; acc=json.load(open('${OUT}'))['accuracy']
+print(f"${MODEL} ${BACK}: accuracy {acc:.3f}")
+assert acc >= ${THRESHOLD}, f"${MODEL} ${BACK} accuracy {acc}"
+PY
+
+  cleanup
+  SERVER_PID=
+  sleep 1
+  PORT=$((PORT+1))
+done
@@ -9,6 +9,6 @@ MAX_NUM_BATCHED_TOKENS=1024
 TENSOR_PARALLEL_SIZE=1
 MAX_MODEL_LEN=2048
 DOWNLOAD_DIR=/mnt/disks/persist
-EXPECTED_THROUGHPUT=10.0
+EXPECTED_THROUGHPUT=8.7
 INPUT_LEN=1800
 OUTPUT_LEN=128
@@ -42,7 +42,7 @@ echo "lanching vllm..."
 echo "logging to $VLLM_LOG"
 echo

-VLLM_USE_V1=1 vllm serve $MODEL \
+vllm serve $MODEL \
   --seed 42 \
   --max-num-seqs $MAX_NUM_SEQS \
   --max-num-batched-tokens $MAX_NUM_BATCHED_TOKENS \
@@ -58,33 +58,25 @@ python3 .buildkite/generate_index.py --wheel "$normal_wheel"
 aws s3 cp "$wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
 aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"

-if [[ $normal_wheel == *"cu126"* ]]; then
-    # if $normal_wheel matches cu126, do not upload the index.html
-    echo "Skipping index files for cu126 wheels"
-elif [[ $normal_wheel == *"cu128"* ]]; then
-    # if $normal_wheel matches cu128, do not upload the index.html
-    echo "Skipping index files for cu128 wheels"
-else
+if [[ $normal_wheel == *"cu129"* ]]; then
     # only upload index.html for cu129 wheels (default wheels) as it
     # is available on both x86 and arm64
     aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
     aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
+else
+    echo "Skipping index files for non-cu129 wheels"
 fi

 # generate index for nightly
 aws s3 cp "$wheel" "s3://vllm-wheels/nightly/"
 aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/"

-if [[ $normal_wheel == *"cu126"* ]]; then
-    # if $normal_wheel matches cu126, do not upload the index.html
-    echo "Skipping index files for cu126 wheels"
-elif [[ $normal_wheel == *"cu128"* ]]; then
-    # if $normal_wheel matches cu128, do not upload the index.html
-    echo "Skipping index files for cu128 wheels"
-else
+if [[ $normal_wheel == *"cu129"* ]]; then
     # only upload index.html for cu129 wheels (default wheels) as it
     # is available on both x86 and arm64
     aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html"
+else
+    echo "Skipping index files for non-cu129 wheels"
 fi

 aws s3 cp "$wheel" "s3://vllm-wheels/$version/"
.buildkite/test-amd.yaml (new file, 1331 lines; diff suppressed because it is too large)
@@ -38,7 +38,7 @@ steps:
 - label: Pytorch Nightly Dependency Override Check # 2min
   # if this test fails, it means the nightly torch version is not compatible with some
   # of the dependencies. Please check the error message and add the package to whitelist
-  # in /vllm/tools/generate_nightly_torch_test.py
+  # in /vllm/tools/pre_commit/generate_nightly_torch_test.py
   soft_fail: true
   source_file_dependencies:
   - requirements/nightly_torch_test.txt
@@ -172,6 +172,8 @@ steps:
   - tests/v1/engine/test_engine_core_client.py
   - tests/distributed/test_symm_mem_allreduce.py
   commands:
+  # https://github.com/NVIDIA/nccl/issues/1838
+  - export NCCL_CUMEM_HOST_ENABLE=0
   # test with torchrun tp=2 and external_dp=2
   - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
   # test with torchrun tp=2 and pp=2
@@ -203,6 +205,24 @@ steps:
   - VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
   - popd

+- label: Distributed Tests (8 GPUs) # 4min
+  timeout_in_minutes: 10
+  gpu: h100
+  num_gpus: 8
+  working_dir: "/vllm-workspace/tests"
+  source_file_dependencies:
+  - examples/offline_inference/torchrun_dp_example.py
+  - vllm/config/parallel.py
+  - vllm/distributed/
+  - vllm/v1/engine/llm_engine.py
+  - vllm/v1/executor/uniproc_executor.py
+  - vllm/v1/worker/gpu_worker.py
+  commands:
+  # https://github.com/NVIDIA/nccl/issues/1838
+  - export NCCL_CUMEM_HOST_ENABLE=0
+  # test with torchrun tp=2 and dp=4 with ep
+  - torchrun --nproc-per-node=8 ../examples/offline_inference/torchrun_dp_example.py --tp-size=2 --pp-size=1 --dp-size=4 --enable-ep
+
 - label: EPLB Algorithm Test # 5min
   timeout_in_minutes: 15
   working_dir: "/vllm-workspace/tests"
@@ -212,8 +232,8 @@ steps:
   commands:
   - pytest -v -s distributed/test_eplb_algo.py

-- label: EPLB Execution Test # 5min
-  timeout_in_minutes: 15
+- label: EPLB Execution Test # 10min
+  timeout_in_minutes: 20
   working_dir: "/vllm-workspace/tests"
   num_gpus: 4
   source_file_dependencies:
@@ -221,6 +241,7 @@ steps:
   - tests/distributed/test_eplb_execute.py
   commands:
   - pytest -v -s distributed/test_eplb_execute.py
+  - pytest -v -s distributed/test_eplb_spec_decode.py

 - label: Metrics, Tracing Test # 12min
   timeout_in_minutes: 20
@@ -295,7 +316,9 @@ steps:
   - vllm/
   - tests/v1
   commands:
+  - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
   # split the test to avoid interference
+  - pytest -v -s -m 'not cpu_test' v1/core
   - pytest -v -s v1/executor
   - pytest -v -s v1/kv_offload
   - pytest -v -s v1/sample
@@ -310,6 +333,24 @@ steps:
   - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
   - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine

+- label: V1 Test attention (H100) # 10min
+  timeout_in_minutes: 30
+  gpu: h100
+  source_file_dependencies:
+  - vllm/v1/attention
+  - tests/v1/attention
+  commands:
+  - pytest -v -s v1/attention
+
+- label: V1 Test attention (B200) # 10min
+  timeout_in_minutes: 30
+  gpu: b200
+  source_file_dependencies:
+  - vllm/v1/attention
+  - tests/v1/attention
+  commands:
+  - VLLM_DISABLE_FLASHINFER_PREFILL=1 pytest -v -s v1/attention # TODO: FI prefill is bugged and causes incorrectness, fix this
+
 - label: V1 Test others (CPU) # 5 mins
   source_file_dependencies:
   - vllm/
@@ -317,7 +358,7 @@ steps:
   no_gpu: true
   commands:
   # split the test to avoid interference
-  - pytest -v -s v1/core
+  - pytest -v -s -m 'cpu_test' v1/core
   - pytest -v -s v1/structured_output
   - pytest -v -s v1/test_serial_utils.py
   - pytest -v -s -m 'cpu_test' v1/kv_connector/unit
@@ -348,7 +389,8 @@ steps:
   - python3 offline_inference/basic/embed.py
   - python3 offline_inference/basic/score.py
   - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
-  - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
+  # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
+  - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536

 - label: Platform Tests (CUDA) # 4min
   timeout_in_minutes: 15
@@ -383,7 +425,12 @@ steps:
     --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
     --ignore=lora/test_chatglm3_tp.py \
     --ignore=lora/test_llama_tp.py \
-    --ignore=lora/test_llm_with_multi_loras.py
+    --ignore=lora/test_llm_with_multi_loras.py \
+    --ignore=lora/test_olmoe_tp.py \
+    --ignore=lora/test_deepseekv2_tp.py \
+    --ignore=lora/test_gptoss_tp.py \
+    --ignore=lora/test_qwen3moe_tp.py

   parallelism: 4

 - label: PyTorch Compilation Unit Tests # 15min
@@ -394,16 +441,16 @@ steps:
   - vllm/
   - tests/compile
   commands:
+  - pytest -v -s compile/test_config.py
   - pytest -v -s compile/test_pass_manager.py
   - pytest -v -s compile/test_fusion.py
   - pytest -v -s compile/test_fusion_attn.py
   - pytest -v -s compile/test_functionalization.py
   - pytest -v -s compile/test_silu_mul_quant_fusion.py
-  - pytest -v -s compile/test_sequence_parallelism.py
-  - pytest -v -s compile/test_async_tp.py
   - pytest -v -s compile/test_fusion_all_reduce.py
   - pytest -v -s compile/test_decorator.py
   - pytest -v -s compile/test_noop_elimination.py
+  - pytest -v -s compile/test_aot_compile.py

 - label: PyTorch Fullgraph Smoke Test # 15min
   timeout_in_minutes: 30
@@ -414,17 +461,34 @@ steps:
   - tests/compile
   commands:
   - pytest -v -s compile/test_basic_correctness.py
+  - pytest -v -s compile/test_multimodal_compile.py
   - pytest -v -s compile/piecewise/

-- label: PyTorch Fullgraph Test # 20min
+- label: PyTorch Fullgraph Test # 22min
- label: PyTorch Fullgraph Test # 22min
|
||||||
timeout_in_minutes: 30
|
timeout_in_minutes: 35
|
||||||
mirror_hardwares: [amdexperimental]
|
mirror_hardwares: [amdexperimental]
|
||||||
torch_nightly: true
|
torch_nightly: true
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- vllm/
|
- vllm/
|
||||||
- tests/compile
|
- tests/compile
|
||||||
commands:
|
commands:
|
||||||
- pytest -v -s compile/test_full_graph.py
|
# fp8 kv scales not supported on sm89, tested on Blackwell instead
|
||||||
|
- pytest -v -s compile/test_full_graph.py -k 'not test_fp8_kv_scale_compile'
|
||||||
|
# Limit to no custom ops to reduce running time
|
||||||
|
# Wrap with quotes to escape yaml and avoid starting -k string with a -
|
||||||
|
- "pytest -v -s compile/test_fusions_e2e.py -k 'TRITON and not +quant_fp8 and not Llama-4'"
|
||||||
|
|
||||||
|
- label: Cudagraph test
|
||||||
|
timeout_in_minutes: 20
|
||||||
|
mirror_hardwares: [amdexperimental]
|
||||||
|
source_file_dependencies:
|
||||||
|
- tests/v1/cudagraph
|
||||||
|
- vllm/v1/cudagraph_dispatcher.py
|
||||||
|
- vllm/config/compilation.py
|
||||||
|
- vllm/compilation
|
||||||
|
commands:
|
||||||
|
- pytest -v -s v1/cudagraph/test_cudagraph_dispatch.py
|
||||||
|
- pytest -v -s v1/cudagraph/test_cudagraph_mode.py
|
||||||
|
|
||||||
- label: Kernels Core Operation Test # 48min
|
- label: Kernels Core Operation Test # 48min
|
||||||
timeout_in_minutes: 75
|
timeout_in_minutes: 75
|
||||||
@ -432,8 +496,9 @@ steps:
|
|||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
- csrc/
|
- csrc/
|
||||||
- tests/kernels/core
|
- tests/kernels/core
|
||||||
|
- tests/kernels/test_top_k_per_row.py
|
||||||
commands:
|
commands:
|
||||||
- pytest -v -s kernels/core
|
- pytest -v -s kernels/core kernels/test_top_k_per_row.py
|
||||||
|
|
||||||
- label: Kernels Attention Test %N # 23min
|
- label: Kernels Attention Test %N # 23min
|
||||||
timeout_in_minutes: 35
|
timeout_in_minutes: 35
|
||||||
@ -467,6 +532,8 @@ steps:
|
|||||||
- tests/kernels/moe
|
- tests/kernels/moe
|
||||||
- vllm/model_executor/layers/fused_moe/
|
- vllm/model_executor/layers/fused_moe/
|
||||||
- vllm/distributed/device_communicators/
|
- vllm/distributed/device_communicators/
|
||||||
|
- vllm/envs.py
|
||||||
|
- vllm/config
|
||||||
commands:
|
commands:
|
||||||
- pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
|
- pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
|
||||||
parallelism: 2
|
parallelism: 2
|
||||||
@ -483,8 +550,11 @@ steps:
|
|||||||
|
|
||||||
- label: Model Executor Test # 23min
|
- label: Model Executor Test # 23min
|
||||||
timeout_in_minutes: 35
|
timeout_in_minutes: 35
|
||||||
|
torch_nightly: true
|
||||||
mirror_hardwares: [amdexperimental]
|
mirror_hardwares: [amdexperimental]
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
|
- vllm/engine/arg_utils.py
|
||||||
|
- vllm/config/model.py
|
||||||
- vllm/model_executor
|
- vllm/model_executor
|
||||||
- tests/model_executor
|
- tests/model_executor
|
||||||
- tests/entrypoints/openai/test_tensorizer_entrypoint.py
|
- tests/entrypoints/openai/test_tensorizer_entrypoint.py
|
||||||
@ -526,8 +596,9 @@ steps:
|
|||||||
# since torchao nightly is only compatible with torch nightly currently
|
# since torchao nightly is only compatible with torch nightly currently
|
||||||
# https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
|
# https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
|
||||||
# we can only upgrade after this is resolved
|
# we can only upgrade after this is resolved
|
||||||
- pip install --pre torchao==0.13.0.dev20250814 --index-url https://download.pytorch.org/whl/nightly/cu128
|
# TODO(jerryzh168): resolve the above comment
|
||||||
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/
|
- uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129
|
||||||
|
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
|
||||||
|
|
||||||
- label: LM Eval Small Models # 53min
|
- label: LM Eval Small Models # 53min
|
||||||
timeout_in_minutes: 75
|
timeout_in_minutes: 75
|
||||||
@ -676,8 +747,10 @@ steps:
|
|||||||
- vllm/
|
- vllm/
|
||||||
- tests/models/language/generation
|
- tests/models/language/generation
|
||||||
commands:
|
commands:
|
||||||
# Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
|
# Install fast path packages for testing against transformers
|
||||||
- pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
|
# Note: also needed to run plamo2 model in vLLM
|
||||||
|
- uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
|
||||||
|
- uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
|
||||||
- pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'
|
- pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)'
|
||||||
|
|
||||||
- label: Language Models Test (PPL)
|
- label: Language Models Test (PPL)
|
||||||
@ -732,6 +805,16 @@ steps:
|
|||||||
- pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
|
- pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
|
||||||
- cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
|
- cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
|
||||||
|
|
||||||
|
- label: Multi-Modal Accuracy Eval (Small Models) # 50min
|
||||||
|
timeout_in_minutes: 70
|
||||||
|
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
|
||||||
|
source_file_dependencies:
|
||||||
|
- vllm/multimodal/
|
||||||
|
- vllm/inputs/
|
||||||
|
- vllm/v1/core/
|
||||||
|
commands:
|
||||||
|
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
|
||||||
|
|
||||||
- label: Multi-Modal Models Test (Extended) 1
|
- label: Multi-Modal Models Test (Extended) 1
|
||||||
mirror_hardwares: [amdexperimental]
|
mirror_hardwares: [amdexperimental]
|
||||||
optional: true
|
optional: true
|
||||||
@ -786,17 +869,17 @@ steps:
|
|||||||
optional: true
|
optional: true
|
||||||
commands:
|
commands:
|
||||||
- pip install --upgrade git+https://github.com/huggingface/transformers
|
- pip install --upgrade git+https://github.com/huggingface/transformers
|
||||||
- pytest -v -s tests/models/test_initialization.py
|
- pytest -v -s tests/models/test_initialization.py -k 'not (Gemma3 or ModernBert or Qwen2_5_VL or Qwen2_5vl or Qwen2VL or TransformersMultiModalEmbeddingModel or TransformersMultiModalForSequenceClassification or Ultravox or Phi4Multimodal or LlavaNextVideo or MiniCPMO or Lfm2Moe or PaliGemma or RobertaForSequenceClassification or Ovis2_5 or Fuyu or DeepseekOCR or KimiVL)'
|
||||||
- pytest -v -s tests/models/test_transformers.py
|
- pytest -v -s tests/models/test_transformers.py
|
||||||
- pytest -v -s tests/models/multimodal/processing/
|
# - pytest -v -s tests/models/multimodal/processing/
|
||||||
- pytest -v -s tests/models/multimodal/test_mapping.py
|
- pytest -v -s tests/models/multimodal/test_mapping.py -k 'not (Gemma3 or Qwen2VL or Qwen2_5_VL)'
|
||||||
- python3 examples/offline_inference/basic/chat.py
|
- python3 examples/offline_inference/basic/chat.py
|
||||||
- python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
|
# - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
|
||||||
# Whisper needs spawn method to avoid deadlock
|
# Whisper needs spawn method to avoid deadlock
|
||||||
- VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
|
- VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
|
||||||
|
|
||||||
- label: Blackwell Test # 38 min
|
- label: Blackwell Test # 21 min
|
||||||
timeout_in_minutes: 60
|
timeout_in_minutes: 30
|
||||||
working_dir: "/vllm-workspace/"
|
working_dir: "/vllm-workspace/"
|
||||||
gpu: b200
|
gpu: b200
|
||||||
# optional: true
|
# optional: true
|
||||||
@ -809,8 +892,6 @@ steps:
|
|||||||
- vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
|
- vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
|
||||||
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
|
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
|
||||||
- vllm/v1/attention/backends/flashinfer.py
|
- vllm/v1/attention/backends/flashinfer.py
|
||||||
- vllm/compilation/fusion.py
|
|
||||||
- vllm/compilation/fusion_attn.py
|
|
||||||
commands:
|
commands:
|
||||||
- nvidia-smi
|
- nvidia-smi
|
||||||
- python3 examples/offline_inference/basic/chat.py
|
- python3 examples/offline_inference/basic/chat.py
|
||||||
@ -827,13 +908,58 @@ steps:
|
|||||||
- pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
|
- pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
|
||||||
- pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
|
- pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
|
||||||
- pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
|
- pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
|
||||||
|
- pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py
|
||||||
|
- pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py
|
||||||
- pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
|
- pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
|
||||||
- pytest -v -s tests/kernels/moe/test_mxfp4_moe.py
|
- pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
|
||||||
# Fusion
|
|
||||||
- pytest -v -s tests/compile/test_fusion_all_reduce.py
|
|
||||||
- pytest -v -s tests/compile/test_fusion_attn.py::test_attention_quant_pattern
|
|
||||||
- pytest -v -s tests/kernels/moe/test_flashinfer.py
|
- pytest -v -s tests/kernels/moe/test_flashinfer.py
|
||||||
|
|
||||||
|
- label: Blackwell Fusion & Compile Tests # 30 min
|
||||||
|
timeout_in_minutes: 40
|
||||||
|
working_dir: "/vllm-workspace/"
|
||||||
|
gpu: b200
|
||||||
|
source_file_dependencies:
|
||||||
|
- csrc/quantization/fp4/
|
||||||
|
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
|
||||||
|
- vllm/v1/attention/backends/flashinfer.py
|
||||||
|
- vllm/compilation/
|
||||||
|
# can affect pattern matching
|
||||||
|
- vllm/model_executor/layers/layernorm.py
|
||||||
|
- vllm/model_executor/layers/activation.py
|
||||||
|
- vllm/model_executor/layers/quantization/input_quant_fp8.py
|
||||||
|
commands:
|
||||||
|
- nvidia-smi
|
||||||
|
- pytest -v -s tests/compile/test_fusion_attn.py
|
||||||
- pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
|
- pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
|
||||||
|
# this runner has 2 GPUs available even though num_gpus=2 is not set
|
||||||
|
- pytest -v -s tests/compile/test_fusion_all_reduce.py
|
||||||
|
# Limit to Inductor partition, no custom ops, and allreduce & attn fusion to reduce running time
|
||||||
|
# Wrap with quotes to escape yaml
|
||||||
|
- "pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm -k 'True and not +quant_fp8 and not +rms_norm'"
|
||||||
|
# test_fp8_kv_scale_compile requires FlashAttention (not supported on default L4/L40)
|
||||||
|
- pytest -v -s tests/compile/test_full_graph.py::test_fp8_kv_scale_compile
|
||||||
|
|
||||||
|
- label: Blackwell Fusion E2E Tests # 30 min
|
||||||
|
timeout_in_minutes: 40
|
||||||
|
working_dir: "/vllm-workspace/"
|
||||||
|
gpu: b200
|
||||||
|
optional: true
|
||||||
|
num_gpus: 2
|
||||||
|
source_file_dependencies:
|
||||||
|
- csrc/quantization/fp4/
|
||||||
|
- vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
|
||||||
|
- vllm/v1/attention/backends/flashinfer.py
|
||||||
|
- vllm/compilation/
|
||||||
|
# can affect pattern matching
|
||||||
|
- vllm/model_executor/layers/layernorm.py
|
||||||
|
- vllm/model_executor/layers/activation.py
|
||||||
|
- vllm/model_executor/layers/quantization/input_quant_fp8.py
|
||||||
|
- tests/compile/test_fusions_e2e.py
|
||||||
|
- tests/compile/test_full_graph.py
|
||||||
|
commands:
|
||||||
|
- nvidia-smi
|
||||||
|
# Run all e2e fusion tests
|
||||||
|
- pytest -v -s tests/compile/test_fusions_e2e.py
|
||||||
|
|
||||||
- label: Blackwell GPT-OSS Eval
|
- label: Blackwell GPT-OSS Eval
|
||||||
timeout_in_minutes: 60
|
timeout_in_minutes: 60
|
||||||
@ -867,7 +993,7 @@ steps:
|
|||||||
- pytest -s -v tests/quantization/test_blackwell_moe.py
|
- pytest -s -v tests/quantization/test_blackwell_moe.py
|
||||||
|
|
||||||
- label: Blackwell LM Eval Small Models
|
- label: Blackwell LM Eval Small Models
|
||||||
timeout_in_minutes: 75
|
timeout_in_minutes: 120
|
||||||
gpu: b200
|
gpu: b200
|
||||||
optional: true # run on nightlies
|
optional: true # run on nightlies
|
||||||
source_file_dependencies:
|
source_file_dependencies:
|
||||||
@ -940,6 +1066,8 @@ steps:
|
|||||||
- tests/v1/shutdown
|
- tests/v1/shutdown
|
||||||
- tests/v1/worker/test_worker_memory_snapshot.py
|
- tests/v1/worker/test_worker_memory_snapshot.py
|
||||||
commands:
|
commands:
|
||||||
|
# https://github.com/NVIDIA/nccl/issues/1838
|
||||||
|
- export NCCL_CUMEM_HOST_ENABLE=0
|
||||||
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
|
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
|
||||||
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
|
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
|
||||||
- DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
|
- DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
|
||||||
@ -947,6 +1075,7 @@ steps:
|
|||||||
- pytest -v -s ./compile/test_basic_correctness.py
|
- pytest -v -s ./compile/test_basic_correctness.py
|
||||||
- pytest -v -s ./compile/test_wrapper.py
|
- pytest -v -s ./compile/test_wrapper.py
|
||||||
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
|
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
|
||||||
|
- VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
|
||||||
- pytest -v -s distributed/test_sequence_parallel.py
|
- pytest -v -s distributed/test_sequence_parallel.py
|
||||||
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
|
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
|
||||||
- pytest -v -s v1/worker/test_worker_memory_snapshot.py
|
- pytest -v -s v1/worker/test_worker_memory_snapshot.py
|
||||||
@ -990,6 +1119,11 @@ steps:
|
|||||||
- pytest -v -s plugins_tests/test_io_processor_plugins.py
|
- pytest -v -s plugins_tests/test_io_processor_plugins.py
|
||||||
- pip uninstall prithvi_io_processor_plugin -y
|
- pip uninstall prithvi_io_processor_plugin -y
|
||||||
# end io_processor plugins test
|
# end io_processor plugins test
|
||||||
|
# begin stat_logger plugins test
|
||||||
|
- pip install -e ./plugins/vllm_add_dummy_stat_logger
|
||||||
|
- pytest -v -s plugins_tests/test_stats_logger_plugins.py
|
||||||
|
- pip uninstall dummy_stat_logger -y
|
||||||
|
# end stat_logger plugins test
|
||||||
# other tests continue here:
|
# other tests continue here:
|
||||||
- pytest -v -s plugins_tests/test_scheduler_plugins.py
|
- pytest -v -s plugins_tests/test_scheduler_plugins.py
|
||||||
- pip install -e ./plugins/vllm_add_dummy_model
|
- pip install -e ./plugins/vllm_add_dummy_model
|
||||||
@ -1029,6 +1163,8 @@ steps:
|
|||||||
- pytest -v -s -x lora/test_chatglm3_tp.py
|
- pytest -v -s -x lora/test_chatglm3_tp.py
|
||||||
- pytest -v -s -x lora/test_llama_tp.py
|
- pytest -v -s -x lora/test_llama_tp.py
|
||||||
- pytest -v -s -x lora/test_llm_with_multi_loras.py
|
- pytest -v -s -x lora/test_llm_with_multi_loras.py
|
||||||
|
- pytest -v -s -x lora/test_olmoe_tp.py
|
||||||
|
- pytest -v -s -x lora/test_gptoss_tp.py
|
||||||
|
|
||||||
|
|
||||||
- label: Weight Loading Multiple GPU Test # 33min
|
- label: Weight Loading Multiple GPU Test # 33min
|
||||||
@ -1055,6 +1191,17 @@ steps:
|
|||||||
commands:
|
commands:
|
||||||
- bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
|
- bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
|
||||||
|
|
||||||
|
- label: NixlConnector PD accuracy tests (Distributed) # 30min
|
||||||
|
timeout_in_minutes: 30
|
||||||
|
working_dir: "/vllm-workspace/tests"
|
||||||
|
num_gpus: 4
|
||||||
|
source_file_dependencies:
|
||||||
|
- vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
|
||||||
|
- tests/v1/kv_connector/nixl_integration/
|
||||||
|
commands:
|
||||||
|
- uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
|
||||||
|
- bash v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh
|
||||||
|
|
||||||
|
|
||||||
##### multi gpus test #####
|
##### multi gpus test #####
|
||||||
##### A100 test #####
|
##### A100 test #####
|
||||||
@ -1085,15 +1232,34 @@ steps:
|
|||||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||||
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
|
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
|
||||||
|
|
||||||
|
##### H100 test #####
|
||||||
|
- label: LM Eval Large Models (H100) # optional
|
||||||
|
gpu: h100
|
||||||
|
optional: true
|
||||||
|
num_gpus: 4
|
||||||
|
working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
|
||||||
|
source_file_dependencies:
|
||||||
|
- csrc/
|
||||||
|
- vllm/model_executor/layers/quantization
|
||||||
|
commands:
|
||||||
|
- export VLLM_USE_DEEP_GEMM=0 # We found Triton is faster than DeepGEMM for H100
|
||||||
|
- pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large-hopper.txt --tp-size=4
|
||||||
|
|
||||||
##### H200 test #####
|
##### H200 test #####
|
||||||
- label: Distrubted Tests (H200) # optional
|
- label: Distributed Tests (H200) # optional
|
||||||
gpu: h200
|
gpu: h200
|
||||||
optional: true
|
optional: true
|
||||||
working_dir: "/vllm-workspace/"
|
working_dir: "/vllm-workspace/"
|
||||||
num_gpus: 2
|
num_gpus: 2
|
||||||
commands:
|
commands:
|
||||||
|
- pytest -v -s tests/compile/test_async_tp.py
|
||||||
|
- pytest -v -s tests/compile/test_sequence_parallelism.py
|
||||||
|
- pytest -v -s tests/compile/test_fusion_all_reduce.py
|
||||||
|
- "pytest -v -s tests/compile/test_fusions_e2e.py -k 'not Llama-4'"
|
||||||
|
- pytest -v -s tests/distributed/test_sequence_parallel.py
|
||||||
- pytest -v -s tests/distributed/test_context_parallel.py
|
- pytest -v -s tests/distributed/test_context_parallel.py
|
||||||
- CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
|
- CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1 --dp-size=2 --max-model-len 2048
|
||||||
|
- pytest -v -s tests/v1/distributed/test_dbo.py
|
||||||
|
|
||||||
##### B200 test #####
|
##### B200 test #####
|
||||||
- label: Distributed Tests (B200) # optional
|
- label: Distributed Tests (B200) # optional
|
||||||
@ -1104,6 +1270,7 @@ steps:
|
|||||||
commands:
|
commands:
|
||||||
- pytest -v -s tests/distributed/test_context_parallel.py
|
- pytest -v -s tests/distributed/test_context_parallel.py
|
||||||
- pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
|
- pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
|
||||||
|
- pytest -v -s tests/v1/distributed/test_dbo.py
|
||||||
|
|
||||||
##### RL Integration Tests #####
|
##### RL Integration Tests #####
|
||||||
- label: Prime-RL Integration Test # 15min
|
- label: Prime-RL Integration Test # 15min
|
||||||
@ -1116,3 +1283,21 @@ steps:
|
|||||||
- .buildkite/scripts/run-prime-rl-test.sh
|
- .buildkite/scripts/run-prime-rl-test.sh
|
||||||
commands:
|
commands:
|
||||||
- bash .buildkite/scripts/run-prime-rl-test.sh
|
- bash .buildkite/scripts/run-prime-rl-test.sh
|
||||||
|
|
||||||
|
- label: DeepSeek V2-Lite Accuracy
|
||||||
|
timeout_in_minutes: 60
|
||||||
|
gpu: h100
|
||||||
|
optional: true
|
||||||
|
num_gpus: 4
|
||||||
|
working_dir: "/vllm-workspace"
|
||||||
|
commands:
|
||||||
|
- bash .buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh 0.25 200 8010
|
||||||
|
|
||||||
|
- label: Qwen3-30B-A3B-FP8-block Accuracy
|
||||||
|
timeout_in_minutes: 60
|
||||||
|
gpu: h100
|
||||||
|
optional: true
|
||||||
|
num_gpus: 4
|
||||||
|
working_dir: "/vllm-workspace"
|
||||||
|
commands:
|
||||||
|
- bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep.sh 0.8 200 8020
|
||||||
|
|||||||
.coveragerc  (17 changes)
@@ -1,5 +1,10 @@
 [run]
-source = vllm
+# Track the installed vllm package (this is what actually gets imported during tests)
+# Use wildcard pattern to match the installed location
+source =
+vllm
+*/dist-packages/vllm
+*/site-packages/vllm
 omit =
 */tests/*
 */test_*
@@ -12,6 +17,16 @@ omit =
 */benchmarks/*
 */docs/*
+
+[paths]
+# Map all possible vllm locations to a canonical "vllm" path
+# This ensures coverage.combine properly merges data from different test runs
+source =
+vllm
+/vllm-workspace/src/vllm
+/vllm-workspace/vllm
+*/site-packages/vllm
+*/dist-packages/vllm
+
 [report]
 exclude_lines =
 pragma: no cover
.git-blame-ignore-revs  (new file, 4 changes)
@@ -0,0 +1,4 @@
+# Migrate from `yapf` & `isort` to `ruff`
+d6953beb91da4e9c99be4c0a1304a2d24189535c
+# Convert `Optional[x]` to `x | None` and `Union[x, y]` to `x | y`
+8fcaaf6a165e661f63fc51be906bc05b0767332f
.github/CODEOWNERS  (vendored, 49 changes)
@@ -5,13 +5,11 @@
 /vllm/attention @LucasWilkinson
 /vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
 /vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
-/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
+/vllm/model_executor/layers/fused_moe @mgoin @pavanimajety
-/vllm/model_executor/layers/fused_moe @mgoin
+/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 @pavanimajety
-/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @NickLucche
-/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256
 /vllm/model_executor/layers/mamba @tdoublep
 /vllm/model_executor/model_loader @22quinn
-/vllm/multimodal @DarkLight1337 @ywang96 @NickLucche
+/vllm/multimodal @DarkLight1337 @ywang96 @NickLucche @tjtanaa
 /vllm/vllm_flash_attn @LucasWilkinson
 /vllm/lora @jeejeelee
 /vllm/reasoning @aarnphm @chaunceyjiang
@@ -26,9 +24,9 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 /vllm/config/cache.py @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg @heheda12345

 # vLLM V1
-/vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
 /vllm/v1/attention @LucasWilkinson
-/vllm/v1/attention/backends/flashinfer.py @mgoin
+/vllm/v1/attention/backends/mla @pavanimajety
+/vllm/v1/attention/backends/flashinfer.py @mgoin @pavanimajety
 /vllm/v1/attention/backends/triton_attn.py @tdoublep
 /vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat @heheda12345 @ApostaC
 /vllm/v1/sample @22quinn @houseroad @njhill
@@ -47,7 +45,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 /tests/kernels @mgoin @tlrmchlsmth @WoosukKwon @yewentao256
 /tests/models @DarkLight1337 @ywang96
 /tests/multimodal @DarkLight1337 @ywang96 @NickLucche
-/tests/quantization @mgoin @robertgshaw2-redhat @yewentao256
+/tests/quantization @mgoin @robertgshaw2-redhat @yewentao256 @pavanimajety
 /tests/test_inputs.py @DarkLight1337 @ywang96
 /tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
 /tests/v1/structured_output @mgoin @russellb @aarnphm
@@ -60,7 +58,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 /tests/v1/offloading @ApostaC

 # Transformers backend
-/vllm/model_executor/models/transformers.py @hmellor
+/vllm/model_executor/models/transformers @hmellor
 /tests/models/test_transformers.py @hmellor

 # Docs
@@ -107,11 +105,21 @@ mkdocs.yaml @hmellor
 /vllm/attention/ops/triton_unified_attention.py @tdoublep

 # ROCm related: specify owner with write access to notify AMD folks for careful code review
-/docker/Dockerfile.rocm* @gshtras
+/vllm/**/*rocm* @tjtanaa
-/vllm/v1/attention/backends/rocm*.py @gshtras
+/docker/Dockerfile.rocm* @gshtras @tjtanaa
-/vllm/v1/attention/backends/mla/rocm*.py @gshtras
+/vllm/v1/attention/backends/rocm*.py @gshtras @tjtanaa
-/vllm/attention/ops/rocm*.py @gshtras
+/vllm/v1/attention/backends/mla/rocm*.py @gshtras @tjtanaa
-/vllm/model_executor/layers/fused_moe/rocm*.py @gshtras
+/vllm/attention/ops/rocm*.py @gshtras @tjtanaa
+/vllm/model_executor/layers/fused_moe/rocm*.py @gshtras @tjtanaa
+/csrc/rocm @gshtras @tjtanaa
+/requirements/*rocm* @tjtanaa
+/tests/**/*rocm* @tjtanaa
+/docs/**/*rocm* @tjtanaa
+/vllm/**/*quark* @tjtanaa
+/tests/**/*quark* @tjtanaa
+/docs/**/*quark* @tjtanaa
+/vllm/**/*aiter* @tjtanaa
+/tests/**/*aiter* @tjtanaa

 # TPU
 /vllm/v1/worker/tpu* @NickLucche
@@ -121,3 +129,16 @@ mkdocs.yaml @hmellor

 # KVConnector installation files
 /requirements/kv_connectors.txt @NickLucche
+
+# Pooling models
+/examples/*/pooling/ @noooop
+/tests/models/*/pooling* @noooop
+/tests/entrypoints/pooling @noooop
+/vllm/config/pooler.py @noooop
+/vllm/pooling_params.py @noooop
+/vllm/model_executor/layers/pooler.py @noooop
+
+# Security guide and policies
+/docs/usage/security.md @russellb
+/SECURITY.md @russellb
+/docs/contributing/vulnerability_management.md @russellb
.github/mergify.yml  (vendored, 4 changes)
@@ -11,6 +11,8 @@ pull_request_rules:
 label:
 add:
 - documentation
+comment:
+message: "Documentation preview: https://vllm--{{number}}.org.readthedocs.build/en/{{number}}/"

 - name: label-ci-build
 description: Automatically apply ci/build label
@@ -106,7 +108,7 @@ pull_request_rules:
 - files~=^benchmarks/
 - files~=^vllm/benchmarks/
 - files~=^tests/benchmarks/
-- files~=^\.buildkite/nightly-benchmarks/
+- files~=^\.buildkite/performance-benchmarks/
 actions:
 label:
 add:
.github/workflows/issue_autolabel.yml  (vendored, 138 changes)
@@ -13,6 +13,7 @@ jobs:
 runs-on: ubuntu-latest
 steps:
 - name: Label issues based on keywords
+id: label-step
 uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
 with:
 script: |
@@ -42,7 +43,6 @@ jobs:
 searchIn: "body"
 },
 ],
-
 // Substring search - matches anywhere in text (partial matches)
 substrings: [
 {
@@ -89,14 +89,12 @@ jobs:
 term: "hip_",
 searchIn: "both"
 },
-
 // ROCm tools and libraries
 {
 term: "hipify",
 searchIn: "both"
 },
 ],
-
 // Regex patterns - for complex pattern matching
 regexPatterns: [
 {
@@ -107,13 +105,17 @@ jobs:
 }
 ],
 },
+// Add more label configurations here as needed
+// example: {
+// keywords: [...],
+// substrings: [...],
+// regexPatterns: [...]
+// },
 };

 // Helper function to create regex based on search type
 function createSearchRegex(term, type) {
 // Escape special regex characters in the term
 const escapedTerm = term.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');

 switch (type) {
 case 'keyword':
 // Word boundary search - matches whole words only
@@ -125,16 +127,13 @@ jobs:
 throw new Error(`Unknown search type: ${type}`);
 }
 }
-
 // Helper function to find matching terms in text with line information
 function findMatchingTermsWithLines(text, searchTerms = [], searchType = 'keyword', searchLocation = '') {
 const matches = [];
 const lines = text.split('\n');
-
 for (const termConfig of searchTerms) {
 let regex;
 let term, searchIn, pattern, description, flags;
-
 // Handle different input formats (string or object)
 if (typeof termConfig === 'string') {
 term = termConfig;
@@ -146,21 +145,17 @@ jobs:
 description = termConfig.description;
 flags = termConfig.flags;
 }
-
 // Skip if this term shouldn't be searched in the current location
 if (searchIn !== 'both' && searchIn !== searchLocation) {
 continue;
 }
-
 // Create appropriate regex
 if (searchType === 'regex') {
 regex = new RegExp(pattern, flags || "gi");
 } else {
 regex = createSearchRegex(term, searchType);
 }
-
 const termMatches = [];
-
 // Check each line for matches
 lines.forEach((line, lineIndex) => {
 const lineMatches = line.match(regex);
@@ -175,15 +170,14 @@ jobs:
 originalTerm: term || pattern,
 description: description,
 // Show context around the match in the line
 context: line.length > 100 ?
 line.substring(Math.max(0, line.toLowerCase().indexOf(match.toLowerCase()) - 30),
 line.toLowerCase().indexOf(match.toLowerCase()) + match.length + 30) + '...'
 : line.trim()
 });
 });
 }
 });
-
 if (termMatches.length > 0) {
 matches.push({
 term: term || (description || pattern),
@@ -196,64 +190,48 @@ jobs:
 });
 }
 }
-
 return matches;
 }
-
 // Helper function to check if label should be added
 async function processLabel(labelName, config) {
 const body = context.payload.issue.body || "";
 const title = context.payload.issue.title || "";
-
 core.notice(`Processing label: ${labelName}`);
 core.notice(`Issue Title: "${title}"`);
 core.notice(`Issue Body length: ${body.length} characters`);
-
 let shouldAddLabel = false;
 let allMatches = [];
 let reason = '';
-
 const keywords = config.keywords || [];
 const substrings = config.substrings || [];
 const regexPatterns = config.regexPatterns || [];
-
 core.notice(`Searching with ${keywords.length} keywords, ${substrings.length} substrings, and ${regexPatterns.length} regex patterns`);
-
 // Search in title
 if (title.trim()) {
 core.notice(`Searching in title: "${title}"`);
-
 const titleKeywordMatches = findMatchingTermsWithLines(title, keywords, 'keyword', 'title');
 const titleSubstringMatches = findMatchingTermsWithLines(title, substrings, 'substring', 'title');
 const titleRegexMatches = findMatchingTermsWithLines(title, regexPatterns, 'regex', 'title');
-
 allMatches.push(...titleKeywordMatches, ...titleSubstringMatches, ...titleRegexMatches);
 }
-
 // Search in body
 if (body.trim()) {
 core.notice(`Searching in body (${body.length} characters)`);
-
 const bodyKeywordMatches = findMatchingTermsWithLines(body, keywords, 'keyword', 'body');
 const bodySubstringMatches = findMatchingTermsWithLines(body, substrings, 'substring', 'body');
 const bodyRegexMatches = findMatchingTermsWithLines(body, regexPatterns, 'regex', 'body');
-
 allMatches.push(...bodyKeywordMatches, ...bodySubstringMatches, ...bodyRegexMatches);
 }
-
 if (allMatches.length > 0) {
 core.notice(`Found ${allMatches.length} matching term(s):`);
-
 for (const termMatch of allMatches) {
 const locationText = termMatch.searchLocation === 'title' ? 'title' : 'body';
 const searchInText = termMatch.searchIn === 'both' ? 'both' : termMatch.searchIn;
-
 if (termMatch.searchType === 'regex') {
 core.notice(` 📍 Regex: "${termMatch.term}" (pattern: ${termMatch.pattern}) found ${termMatch.count} time(s) in ${locationText} (configured to search in: ${searchInText}):`);
 } else {
 core.notice(` 📍 Term: "${termMatch.term}" (${termMatch.searchType} search) found ${termMatch.count} time(s) in ${locationText} (configured to search in: ${searchInText}):`);
 }
-
 // Show details for each match
 termMatch.matches.forEach((match, index) => {
 core.notice(` ${index + 1}. Line ${match.lineNumber} in ${match.searchLocation}: "${match.match}" [${match.searchType}]`);
@@ -266,7 +244,6 @@ jobs:
 }
 });
 }
-
 shouldAddLabel = true;
 const totalMatches = allMatches.reduce((sum, t) => sum + t.count, 0);
 const titleMatches = allMatches.filter(t => t.searchLocation === 'title').reduce((sum, t) => sum + t.count, 0);
@@ -274,13 +251,10 @@ jobs:
 const keywordMatches = allMatches.filter(t => t.searchType === 'keyword').reduce((sum, t) => sum + t.count, 0);
 const substringMatches = allMatches.filter(t => t.searchType === 'substring').reduce((sum, t) => sum + t.count, 0);
 const regexMatches = allMatches.filter(t => t.searchType === 'regex').reduce((sum, t) => sum + t.count, 0);
-
 reason = `Found ${totalMatches} total matches (${titleMatches} in title, ${bodyMatches} in body) - ${keywordMatches} keyword matches, ${substringMatches} substring matches, ${regexMatches} regex matches`;
 }
-
 core.notice(`Final decision: ${shouldAddLabel ? 'ADD LABEL' : 'DO NOT ADD LABEL'}`);
 core.notice(`Reason: ${reason || 'No matching terms found'}`);
-
 if (shouldAddLabel) {
 const existingLabels = context.payload.issue.labels.map(l => l.name);
 if (!existingLabels.includes(labelName)) {
@@ -296,14 +270,92 @@ jobs:
 core.notice(`Label "${labelName}" already present.`);
 return false;
 }
-
 core.notice(`No matching terms found for label "${labelName}".`);
 return false;
 }
-
 // Process all configured labels
-const processLabels = Object.entries(labelConfig)
+const labelsAddedResults = await Promise.all(
-.map(([labelName, config]) => processLabel(labelName, config));
+Object.entries(labelConfig).map(([labelName, config]) =>
-const labelsAdded = await Promise.all(processLabels);
+processLabel(labelName, config).then(added => ({ labelName, added }))
-const numLabelsAdded = labelsAdded.reduce((x, y) => x + y, 0);
+)
-core.notice(`Processing complete. ${numLabelsAdded} label(s) added.`);
+);
+
+const numLabelsAdded = labelsAddedResults.filter(r => r.added).length;
+core.notice(`Processing complete. ${numLabelsAdded} label(s) added.`);
+
+// Return which labels were added for the next step
+const addedLabels = labelsAddedResults.filter(r => r.added).map(r => r.labelName);
+core.setOutput('labels_added', JSON.stringify(addedLabels));
+return addedLabels;
+
+- name: CC users for labeled issues
+if: steps.label-step.outputs.labels_added != '[]'
+uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd # v8.0.0
+with:
+script: |
+// Configuration: Map labels to GitHub users to CC
+// You can add multiple users per label, and multiple label configurations
+const ccConfig = {
+rocm: {
+users: ['hongxiayang', 'tjtanaa', 'vllmellm'], // Add more users as needed: ['user1', 'user2', 'user3']
+message: 'CC {users} for ROCm-related issue' // {users} will be replaced with @mentions
+},
+// Add more label -> user mappings here
+// Example:
+// cuda: {
+// users: ['user1', 'user2'],
+// message: 'CC {users} for CUDA-related issue'
+// },
+// performance: {
+// users: ['perfexpert'],
+// message: 'CC {users} for performance issue'
+// },
+};
+
+const labelsAdded = JSON.parse('${{ steps.label-step.outputs.labels_added }}');
+core.notice(`Labels added: ${labelsAdded.join(', ')}`);
+
+// Get existing comments to check for already mentioned users
+const comments = await github.rest.issues.listComments({
+owner: context.repo.owner,
+repo: context.repo.repo,
+issue_number: context.issue.number,
+});
+
+const issueBody = context.payload.issue.body || '';
+const allExistingText = issueBody + '\n' + comments.data.map(c => c.body).join('\n');
+
+// Process each label that was added
+for (const label of labelsAdded) {
+if (ccConfig[label]) {
+const config = ccConfig[label];
+const usersToMention = [];
+
+// Check which users haven't been mentioned yet
+for (const user of config.users) {
+const mentionPattern = new RegExp(`@${user}\\b`, 'i');
+if (!mentionPattern.test(allExistingText)) {
+usersToMention.push(user);
+} else {
+core.notice(`@${user} already mentioned for label "${label}", skipping`);
+}
+}
+
+// Post comment if there are users to mention
+if (usersToMention.length > 0) {
+const mentions = usersToMention.map(u => `@${u}`).join(' ');
+const message = config.message.replace('{users}', mentions);
+
+await github.rest.issues.createComment({
+owner: context.repo.owner,
+repo: context.repo.repo,
+issue_number: context.issue.number,
+body: message
+});
+
+core.notice(`CC comment added for label "${label}": ${mentions}`);
+} else {
+core.notice(`All users for label "${label}" already mentioned, skipping comment`);
+}
+}
+}
.gitignore  (vendored, 6 changes)
@@ -94,6 +94,9 @@ ipython_config.py
 # generated files
 **/generated/**

+# uv
+uv.lock
+
 # pyenv
 # For a library or package, you might want to ignore these files since the code is
 # intended to run in multiple environments; otherwise, check them in:
@@ -218,3 +221,6 @@ csrc/moe/marlin_moe_wna16/kernel_*

 # Ignore ep_kernels_workspace folder
 ep_kernels_workspace/
+
+# Allow tracked library source folders under submodules (e.g., benchmarks/lib)
+!vllm/benchmarks/lib/
.markdownlint.yaml
@@ -4,7 +4,6 @@ MD013: false
 MD024:
 siblings_only: true
 MD033: false
-MD042: false
 MD045: false
 MD046: false
 MD051: false
.pre-commit-config.yaml
@@ -7,17 +7,18 @@ default_stages:
 exclude: 'vllm/third_party/.*'
 repos:
 - repo: https://github.com/astral-sh/ruff-pre-commit
-rev: v0.13.3
+rev: v0.14.0
 hooks:
 - id: ruff-check
 args: [--output-format, github, --fix]
 - id: ruff-format
 - repo: https://github.com/crate-ci/typos
-rev: v1.35.5
+rev: v1.38.1
 hooks:
 - id: typos
+args: [--force-exclude]
 - repo: https://github.com/pre-commit/mirrors-clang-format
-rev: v20.1.3
+rev: v21.1.2
 hooks:
 - id: clang-format
 exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))|vllm/third_party/.*'
@@ -34,32 +35,27 @@ repos:
 hooks:
 - id: actionlint
 - repo: https://github.com/astral-sh/uv-pre-commit
-rev: 0.6.17
+rev: 0.9.1
 hooks:
 - id: pip-compile
-args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu128, --python-platform, x86_64-manylinux_2_28]
+args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu129, --python-platform, x86_64-manylinux_2_28, --python-version, "3.12"]
 files: ^requirements/test\.(in|txt)$
 - repo: local
 hooks:
 - id: format-torch-nightly-test
 name: reformat nightly_torch_test.txt to be in sync with test.in
 language: python
-entry: python tools/generate_nightly_torch_test.py
+entry: python tools/pre_commit/generate_nightly_torch_test.py
 files: ^requirements/test\.(in|txt)$
 - id: mypy-local
-name: Run mypy for local Python installation
+name: Run mypy locally for lowest supported Python version
-entry: python tools/pre_commit/mypy.py 0 "local"
+entry: python tools/pre_commit/mypy.py 0 "3.10"
 stages: [pre-commit] # Don't run in CI
 <<: &mypy_common
 language: python
 types_or: [python, pyi]
 require_serial: true
 additional_dependencies: [mypy==1.11.1, regex, types-cachetools, types-setuptools, types-PyYAML, types-requests, types-torch, pydantic]
-- id: mypy-3.9 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
-name: Run mypy for Python 3.9
-entry: python tools/pre_commit/mypy.py 1 "3.9"
-<<: *mypy_common
-stages: [manual] # Only run in CI
 - id: mypy-3.10 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
 name: Run mypy for Python 3.10
 entry: python tools/pre_commit/mypy.py 1 "3.10"
@@ -75,14 +71,19 @@ repos:
 entry: python tools/pre_commit/mypy.py 1 "3.12"
 <<: *mypy_common
 stages: [manual] # Only run in CI
+- id: mypy-3.13 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
+name: Run mypy for Python 3.13
+entry: python tools/pre_commit/mypy.py 1 "3.13"
+<<: *mypy_common
+stages: [manual] # Only run in CI
 - id: shellcheck
 name: Lint shell scripts
-entry: tools/shellcheck.sh
+entry: tools/pre_commit/shellcheck.sh
 language: script
 types: [shell]
 - id: png-lint
 name: Lint PNG exports from excalidraw
-entry: tools/png-lint.sh
+entry: tools/pre_commit/png-lint.sh
 language: script
 types: [png]
 - id: signoff-commit
@@ -99,12 +100,12 @@ repos:
 stages: [commit-msg]
 - id: check-spdx-header
 name: Check SPDX headers
-entry: python tools/check_spdx_header.py
+entry: python tools/pre_commit/check_spdx_header.py
 language: python
 types: [python]
 - id: check-root-lazy-imports
 name: Check root lazy imports
-entry: python tools/check_init_lazy_imports.py
+entry: python tools/pre_commit/check_init_lazy_imports.py
 language: python
 types: [python]
 - id: check-filenames
@@ -118,11 +119,11 @@ repos:
 pass_filenames: false
 - id: update-dockerfile-graph
 name: Update Dockerfile dependency graph
-entry: tools/update-dockerfile-graph.sh
+entry: tools/pre_commit/update-dockerfile-graph.sh
 language: script
|
language: script
|
||||||
- id: enforce-import-regex-instead-of-re
|
- id: enforce-import-regex-instead-of-re
|
||||||
name: Enforce import regex as re
|
name: Enforce import regex as re
|
||||||
entry: python tools/enforce_regex_import.py
|
entry: python tools/pre_commit/enforce_regex_import.py
|
||||||
language: python
|
language: python
|
||||||
types: [python]
|
types: [python]
|
||||||
pass_filenames: false
|
pass_filenames: false
|
||||||
@ -130,7 +131,7 @@ repos:
|
|||||||
# forbid directly import triton
|
# forbid directly import triton
|
||||||
- id: forbid-direct-triton-import
|
- id: forbid-direct-triton-import
|
||||||
name: "Forbid direct 'import triton'"
|
name: "Forbid direct 'import triton'"
|
||||||
entry: python tools/check_triton_import.py
|
entry: python tools/pre_commit/check_triton_import.py
|
||||||
language: python
|
language: python
|
||||||
types: [python]
|
types: [python]
|
||||||
pass_filenames: false
|
pass_filenames: false
|
||||||
@ -143,7 +144,7 @@ repos:
|
|||||||
additional_dependencies: [regex]
|
additional_dependencies: [regex]
|
||||||
- id: validate-config
|
- id: validate-config
|
||||||
name: Validate configuration has default values and that each field has a docstring
|
name: Validate configuration has default values and that each field has a docstring
|
||||||
entry: python tools/validate_config.py
|
entry: python tools/pre_commit/validate_config.py
|
||||||
language: python
|
language: python
|
||||||
additional_dependencies: [regex]
|
additional_dependencies: [regex]
|
||||||
# Keep `suggestion` last
|
# Keep `suggestion` last
|
||||||
@@ -34,7 +34,7 @@ install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
 # Supported python versions. These versions will be searched in order, the
 # first match will be selected. These should be kept in sync with setup.py.
 #
-set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12" "3.13")
+set(PYTHON_SUPPORTED_VERSIONS "3.10" "3.11" "3.12" "3.13")
 
 # Supported AMD GPU architectures.
 set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151")
@@ -49,8 +49,8 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1
 # requirements.txt files and should be kept consistent. The ROCm torch
 # versions are derived from docker/Dockerfile.rocm
 #
-set(TORCH_SUPPORTED_VERSION_CUDA "2.8.0")
-set(TORCH_SUPPORTED_VERSION_ROCM "2.8.0")
+set(TORCH_SUPPORTED_VERSION_CUDA "2.9.0")
+set(TORCH_SUPPORTED_VERSION_ROCM "2.9.0")
 
 #
 # Try to find python package with an executable that exactly matches
@@ -241,7 +241,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   message(STATUS "Enabling cumem allocator extension.")
   # link against cuda driver library
   list(APPEND CUMEM_LIBS CUDA::cuda_driver)
-  define_gpu_extension_target(
+  define_extension_target(
     cumem_allocator
     DESTINATION vllm
     LANGUAGE CXX
@@ -269,8 +269,8 @@ set(VLLM_EXT_SRC
   "csrc/sampler.cu"
   "csrc/cuda_view.cu"
   "csrc/quantization/gptq/q_gemm.cu"
-  "csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
-  "csrc/quantization/fp8/common.cu"
+  "csrc/quantization/w8a8/int8/scaled_quant.cu"
+  "csrc/quantization/w8a8/fp8/common.cu"
   "csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu"
   "csrc/quantization/gguf/gguf_kernel.cu"
   "csrc/quantization/activation_kernels.cu"
@@ -314,12 +314,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   list(APPEND VLLM_EXT_SRC
     "csrc/quantization/awq/gemm_kernels.cu"
     "csrc/permute_cols.cu"
-    "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
+    "csrc/quantization/w8a8/cutlass/scaled_mm_entry.cu"
     "csrc/quantization/fp4/nvfp4_quant_entry.cu"
     "csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu"
     "csrc/sparse/cutlass/sparse_scaled_mm_entry.cu"
     "csrc/cutlass_extensions/common.cpp"
-    "csrc/quantization/fp8/per_token_group_quant.cu")
+    "csrc/quantization/w8a8/fp8/per_token_group_quant.cu"
+    "csrc/quantization/w8a8/int8/per_token_group_quant.cu")
 
   set_gencode_flags_for_srcs(
     SRCS "${VLLM_EXT_SRC}"
@@ -423,11 +424,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}")
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND SCALED_MM_ARCHS)
     set(SRCS
-      "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm90.cu"
-      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu"
-      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_int8.cu"
-      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_azp_sm90_int8.cu"
-      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8.cu")
+      "csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm90.cu"
+      "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm90_fp8.cu"
+      "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm90_int8.cu"
+      "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_azp_sm90_int8.cu"
+      "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm90_fp8.cu")
     set_gencode_flags_for_srcs(
       SRCS "${SRCS}"
      CUDA_ARCHS "${SCALED_MM_ARCHS}")
@@ -458,9 +459,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   endif()
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
     set(SRCS
-      "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm120.cu"
-      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm120_fp8.cu"
-      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm120_fp8.cu"
+      "csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm120.cu"
+      "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm120_fp8.cu"
+      "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm120_fp8.cu"
     )
     set_gencode_flags_for_srcs(
       SRCS "${SRCS}"
@@ -492,9 +493,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   endif()
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
     set(SRCS
-      "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu"
-      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu"
-      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8.cu"
+      "csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm100.cu"
+      "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm100_fp8.cu"
+      "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm100_fp8.cu"
     )
     set_gencode_flags_for_srcs(
       SRCS "${SRCS}"
@@ -525,7 +526,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   # subtract out the archs that are already built for 3x
   list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
   if (SCALED_MM_2X_ARCHS)
-    set(SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu")
+    set(SRCS "csrc/quantization/w8a8/cutlass/scaled_mm_c2x.cu")
     set_gencode_flags_for_srcs(
       SRCS "${SRCS}"
       CUDA_ARCHS "${SCALED_MM_2X_ARCHS}")
@@ -648,7 +649,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   # if it's possible to compile MoE kernels that use its output.
   cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a" "${CUDA_ARCHS}")
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
-    set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x_sm90.cu")
+    set(SRCS "csrc/quantization/w8a8/cutlass/moe/grouped_mm_c3x_sm90.cu")
     set_gencode_flags_for_srcs(
       SRCS "${SRCS}"
       CUDA_ARCHS "${SCALED_MM_ARCHS}")
@@ -672,7 +673,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
   endif()
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
-    set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x_sm100.cu")
+    set(SRCS "csrc/quantization/w8a8/cutlass/moe/grouped_mm_c3x_sm100.cu")
     set_gencode_flags_for_srcs(
       SRCS "${SRCS}"
       CUDA_ARCHS "${SCALED_MM_ARCHS}")
@@ -697,7 +698,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
   endif()
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND CUTLASS_MOE_DATA_ARCHS)
-    set(SRCS "csrc/quantization/cutlass_w8a8/moe/moe_data.cu")
+    set(SRCS "csrc/quantization/w8a8/cutlass/moe/moe_data.cu")
     set_gencode_flags_for_srcs(
       SRCS "${SRCS}"
       CUDA_ARCHS "${CUTLASS_MOE_DATA_ARCHS}")
@@ -720,7 +721,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
   endif()
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
-    set(SRCS "csrc/quantization/cutlass_w8a8/moe/blockwise_scaled_group_mm_sm100.cu")
+    set(SRCS "csrc/quantization/w8a8/cutlass/moe/blockwise_scaled_group_mm_sm100.cu")
     set_gencode_flags_for_srcs(
       SRCS "${SRCS}"
       CUDA_ARCHS "${SCALED_MM_ARCHS}")
@@ -857,7 +858,7 @@ if (VLLM_GPU_LANG STREQUAL "HIP")
 endif()
 
 message(STATUS "Enabling C extension.")
-define_gpu_extension_target(
+define_extension_target(
   _C
   DESTINATION vllm
   LANGUAGE ${VLLM_GPU_LANG}
@@ -882,6 +883,7 @@ target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1)
 set(VLLM_MOE_EXT_SRC
   "csrc/moe/torch_bindings.cpp"
   "csrc/moe/moe_align_sum_kernels.cu"
+  "csrc/moe/moe_lora_align_sum_kernels.cu"
   "csrc/moe/topk_softmax_kernels.cu")
 
 if(VLLM_GPU_LANG STREQUAL "CUDA")
@@ -971,7 +973,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
 endif()
 
 message(STATUS "Enabling moe extension.")
-define_gpu_extension_target(
+define_extension_target(
   _moe_C
   DESTINATION vllm
   LANGUAGE ${VLLM_GPU_LANG}
@@ -992,7 +994,7 @@ if(VLLM_GPU_LANG STREQUAL "HIP")
     "csrc/rocm/skinny_gemms.cu"
     "csrc/rocm/attention.cu")
 
-  define_gpu_extension_target(
+  define_extension_target(
     _rocm_C
     DESTINATION vllm
     LANGUAGE ${VLLM_GPU_LANG}
@@ -1006,6 +1008,7 @@ endif()
 # For CUDA we also build and ship some external projects.
 if (VLLM_GPU_LANG STREQUAL "CUDA")
   include(cmake/external_projects/flashmla.cmake)
+  include(cmake/external_projects/qutlass.cmake)
 
   # vllm-flash-attn should be last as it overwrites some CMake functions
   include(cmake/external_projects/vllm_flash_attn.cmake)
@@ -21,6 +21,8 @@ Join us at the [PyTorch Conference, October 22-23](https://events.linuxfoundatio
 
 *Latest News* 🔥
 
+- [2025/11] We hosted [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/xSrYXjNgr1HbCP4ExYNG1w) focusing on distributed inference and diverse accelerator support with vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1nQJ8ZkLSjKxvu36sSHaceVXtttbLvvu-?usp=drive_link).
+- [2025/10] We hosted [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/__xb4OyOsImz-9eAVrdlcg) focused on hands-on vLLM inference optimization! Please find the meetup slides [here](https://drive.google.com/drive/folders/1KqwjsFJLfEsC8wlDugnrR61zsWHt94Q6).
 - [2025/09] We hosted [vLLM Toronto Meetup](https://luma.com/e80e0ymm) focused on tackling inference at scale and speculative decoding with speakers from NVIDIA and Red Hat! Please find the meetup slides [here](https://docs.google.com/presentation/d/1IYJYmJcu9fLpID5N5RbW_vO0XLo0CGOR14IXOjB61V8/edit?usp=sharing).
 - [2025/08] We hosted [vLLM Shenzhen Meetup](https://mp.weixin.qq.com/s/k8ZBO1u2_2odgiKWH_GVTQ) focusing on the ecosystem around vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Ua2SVKVSu-wp5vou_6ElraDt2bnKhiEA).
 - [2025/08] We hosted [vLLM Singapore Meetup](https://www.sginnovate.com/event/vllm-sg-meet). We shared V1 updates, disaggregated serving and MLLM speedups with speakers from Embedded LLM, AMD, WekaIO, and A*STAR. Please find the meetup slides [here](https://drive.google.com/drive/folders/1ncf3GyqLdqFaB6IeB834E5TZJPLAOiXZ?usp=sharing).
@@ -82,7 +84,7 @@ vLLM is flexible and easy to use with:
 - Tensor, pipeline, data and expert parallelism support for distributed inference
 - Streaming outputs
 - OpenAI-compatible API server
-- Support for NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, and TPU. Additionally, support for diverse hardware plugins such as Intel Gaudi, IBM Spyre and Huawei Ascend.
+- Support for NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, Arm CPUs, and TPU. Additionally, support for diverse hardware plugins such as Intel Gaudi, IBM Spyre and Huawei Ascend.
 - Prefix caching support
 - Multi-LoRA support
 
@@ -149,6 +151,7 @@ Compute Resources:
 - Trainy
 - UC Berkeley
 - UC San Diego
+- Volcengine
 
 Slack Sponsor: Anyscale
 
@@ -74,7 +74,7 @@ start_server() {
     local vllm_log=$4
     local profile_dir=$5
 
-    pkill -if vllm
+    pkill -if "vllm serve" || true
 
     # Define the common arguments as a bash array.
     # Each argument and its value are separate elements.
@@ -96,11 +96,11 @@ start_server() {
     # This correctly passes each element as a separate argument.
     if [[ -n "$profile_dir" ]]; then
         # Start server with profiling enabled
-        VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir \
+        VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir \
             vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 &
     else
         # Start server without profiling
-        VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 \
+        VLLM_SERVER_DEV_MODE=1 \
            vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 &
     fi
     local server_pid=$!
@@ -139,7 +139,7 @@ run_benchmark() {
     echo "vllm_log: $vllm_log"
     echo
     rm -f $vllm_log
-    pkill -if vllm
+    pkill -if "vllm serve" || true
 
     echo "starting server..."
     # Call start_server without a profile_dir to avoid profiling overhead
@@ -232,7 +232,7 @@ run_benchmark() {
 
     echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput"
 
-    pkill -if vllm
+    pkill -if "vllm serve" || true
     sleep 10
     echo "===================="
     return 0
@@ -308,6 +308,6 @@ if (( $(echo "$best_throughput > 0" | bc -l) )); then
 else
     echo "No configuration met the latency requirements. Skipping final profiling run."
 fi
-pkill -if vllm
+pkill -if "vllm serve" || true
 echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH"
 echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH" >> "$RESULT"
@@ -8,7 +8,6 @@ import sys
 import time
 import traceback
 from dataclasses import dataclass, field
-from typing import Optional, Union
 
 import aiohttp
 import huggingface_hub.constants
@@ -28,13 +27,13 @@ class RequestFuncInput:
     prompt_len: int
     output_len: int
     model: str
-    model_name: Optional[str] = None
-    logprobs: Optional[int] = None
-    extra_body: Optional[dict] = None
-    multi_modal_content: Optional[dict | list[dict]] = None
+    model_name: str | None = None
+    logprobs: int | None = None
+    extra_body: dict | None = None
+    multi_modal_content: dict | list[dict] | None = None
     ignore_eos: bool = False
-    language: Optional[str] = None
-    request_id: Optional[str] = None
+    language: str | None = None
+    request_id: str | None = None
 
 
 @dataclass
@@ -52,7 +51,7 @@ class RequestFuncOutput:
 
 async def async_request_tgi(
     request_func_input: RequestFuncInput,
-    pbar: Optional[tqdm] = None,
+    pbar: tqdm | None = None,
 ) -> RequestFuncOutput:
     api_url = request_func_input.api_url
     assert api_url.endswith("generate_stream")
@@ -133,7 +132,7 @@ async def async_request_tgi(
 
 async def async_request_trt_llm(
     request_func_input: RequestFuncInput,
-    pbar: Optional[tqdm] = None,
+    pbar: tqdm | None = None,
 ) -> RequestFuncOutput:
     api_url = request_func_input.api_url
     assert api_url.endswith("generate_stream")
@@ -204,7 +203,7 @@ async def async_request_trt_llm(
 
 async def async_request_deepspeed_mii(
     request_func_input: RequestFuncInput,
-    pbar: Optional[tqdm] = None,
+    pbar: tqdm | None = None,
 ) -> RequestFuncOutput:
     api_url = request_func_input.api_url
     assert api_url.endswith(("completions", "profile")), (
@@ -267,7 +266,7 @@ async def async_request_deepspeed_mii(
 
 async def async_request_openai_completions(
     request_func_input: RequestFuncInput,
-    pbar: Optional[tqdm] = None,
+    pbar: tqdm | None = None,
 ) -> RequestFuncOutput:
     api_url = request_func_input.api_url
     assert api_url.endswith(("completions", "profile")), (
@@ -367,7 +366,7 @@ async def async_request_openai_completions(
 
 async def async_request_openai_chat_completions(
     request_func_input: RequestFuncInput,
-    pbar: Optional[tqdm] = None,
+    pbar: tqdm | None = None,
 ) -> RequestFuncOutput:
     api_url = request_func_input.api_url
     assert api_url.endswith(("chat/completions", "profile")), (
@@ -476,7 +475,7 @@ async def async_request_openai_chat_completions(
 
 async def async_request_openai_audio(
     request_func_input: RequestFuncInput,
-    pbar: Optional[tqdm] = None,
+    pbar: tqdm | None = None,
 ) -> RequestFuncOutput:
     # Lazy import without PlaceholderModule to avoid vllm dep.
     import soundfile
@@ -610,7 +609,7 @@ def get_tokenizer(
     tokenizer_mode: str = "auto",
     trust_remote_code: bool = False,
     **kwargs,
-) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
+) -> PreTrainedTokenizer | PreTrainedTokenizerFast:
     if pretrained_model_name_or_path is not None and not os.path.exists(
         pretrained_model_name_or_path
     ):
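
Note: the signature changes in this file all follow the same PEP 604 pattern, which the Python 3.10 baseline allows: `Optional[X]` becomes `X | None` and `Union[A, B]` becomes `A | B`, with no `typing` import needed. A minimal, self-contained sketch of that pattern follows; the class and function names here are illustrative only, not taken from the benchmark code.

# Sketch of the PEP 604 union syntax used above (assumes Python 3.10+).
from dataclasses import dataclass


@dataclass
class RequestInfo:
    prompt_len: int
    model_name: str | None = None   # was: Optional[str] = None
    extra_body: dict | None = None  # was: Optional[dict] = None


def describe(info: RequestInfo) -> str | int:  # was: Union[str, int]
    # Fall back to the prompt length when no model name is set.
    return info.model_name or info.prompt_len


print(describe(RequestInfo(prompt_len=128)))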
@@ -5,7 +5,7 @@ import gc
 from benchmark_utils import TimeCollector
 from tabulate import tabulate
 
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 from vllm.v1.core.block_pool import BlockPool
 
 
@@ -46,7 +46,7 @@ import time
 
 from vllm import LLM, SamplingParams
 from vllm.engine.arg_utils import EngineArgs
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 
 
 def test_long_document_qa(llm=None, sampling_params=None, prompts=None):
@@ -19,7 +19,7 @@ from vllm.config import (
     VllmConfig,
 )
 from vllm.platforms import current_platform
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 from vllm.v1.spec_decode.ngram_proposer import NgramProposer
 from vllm.v1.worker.gpu_input_batch import InputBatch
 from vllm.v1.worker.gpu_model_runner import GPUModelRunner
@@ -32,13 +32,12 @@ import dataclasses
 import json
 import random
 import time
-from typing import Optional
 
 from transformers import PreTrainedTokenizerBase
 
 from vllm import LLM, SamplingParams
 from vllm.engine.arg_utils import EngineArgs
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 
 try:
     from vllm.transformers_utils.tokenizer import get_tokenizer
@@ -80,7 +79,7 @@ def sample_requests_from_dataset(
     num_requests: int,
     tokenizer: PreTrainedTokenizerBase,
     input_length_range: tuple[int, int],
-    fixed_output_len: Optional[int],
+    fixed_output_len: int | None,
 ) -> list[Request]:
     if fixed_output_len is not None and fixed_output_len < 4:
         raise ValueError("output_len too small")
@@ -128,7 +127,7 @@ def sample_requests_from_random(
     num_requests: int,
     tokenizer: PreTrainedTokenizerBase,
     input_length_range: tuple[int, int],
-    fixed_output_len: Optional[int],
+    fixed_output_len: int | None,
     prefix_len: int,
 ) -> list[Request]:
     requests = []
@@ -7,12 +7,11 @@ import dataclasses
 import json
 import random
 import time
-from typing import Optional
 
 from transformers import AutoTokenizer, PreTrainedTokenizerBase
 
 from vllm.engine.arg_utils import EngineArgs
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 
 
 # Select a equi-probable random priority
@@ -24,7 +23,7 @@ def sample_requests(
     dataset_path: str,
     num_requests: int,
     tokenizer: PreTrainedTokenizerBase,
-    fixed_output_len: Optional[int],
+    fixed_output_len: int | None,
 ) -> list[tuple[str, int, int, int]]:
     if fixed_output_len is not None and fixed_output_len < 4:
         raise ValueError("output_len too small")
@@ -31,8 +31,8 @@ import time
 import uuid
 import warnings
 from collections.abc import AsyncGenerator
+from contextlib import nullcontext
 from dataclasses import dataclass
-from typing import Optional
 
 import datasets
 import numpy as np
@@ -51,7 +51,7 @@ except ImportError:
     from backend_request_func import get_tokenizer
 
 try:
-    from vllm.utils import FlexibleArgumentParser
+    from vllm.utils.argparse_utils import FlexibleArgumentParser
 except ImportError:
     from argparse import ArgumentParser as FlexibleArgumentParser
 
@@ -316,7 +316,7 @@ def calculate_metrics(
     tokenizer: PreTrainedTokenizerBase,
     selected_percentile_metrics: list[str],
     selected_percentiles: list[float],
-    goodput_config_dict: Optional[dict[str, float]] = None,
+    goodput_config_dict: dict[str, float] | None = None,
 ) -> tuple[BenchmarkMetrics, list[int]]:
     actual_output_lens: list[int] = []
     total_input = 0
@@ -436,9 +436,9 @@ async def benchmark(
     selected_percentile_metrics: list[str],
     selected_percentiles: list[str],
     ignore_eos: bool,
-    max_concurrency: Optional[int],
+    max_concurrency: int | None,
     structured_output_ratio: float,
-    goodput_config_dict: Optional[dict[str, float]] = None,
+    goodput_config_dict: dict[str, float] | None = None,
 ):
     if backend in ASYNC_REQUEST_FUNCS:
         request_func = ASYNC_REQUEST_FUNCS[backend]
@@ -502,15 +502,9 @@ async def benchmark(
 
     pbar = None if disable_tqdm else tqdm(total=len(input_requests))
 
-    # This can be used once the minimum Python version is 3.10 or higher,
-    # and it will simplify the code in limited_request_func.
-    # semaphore = (asyncio.Semaphore(max_concurrency)
-    # if max_concurrency else contextlib.nullcontext())
-    semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None
+    semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else nullcontext()
 
     async def limited_request_func(request_func_input, pbar):
-        if semaphore is None:
-            return await request_func(request_func_input=request_func_input, pbar=pbar)
         async with semaphore:
             return await request_func(request_func_input=request_func_input, pbar=pbar)
 
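
Note: the semaphore change above swaps a `None` sentinel for `contextlib.nullcontext()`, so the limiting wrapper can always use `async with` (on Python 3.10+ `nullcontext` also works as an async context manager). A small self-contained sketch of that pattern, with illustrative names only:

# Sketch of an optional concurrency cap via nullcontext (assumes Python 3.10+).
import asyncio
from contextlib import nullcontext


async def fake_request(i: int) -> int:
    await asyncio.sleep(0.01)  # stand-in for a real HTTP call
    return i


async def run_all(n: int, max_concurrency: int | None) -> list[int]:
    # A real semaphore when a cap is set, otherwise a do-nothing context
    # manager, so the wrapper below never has to branch on None.
    limiter = asyncio.Semaphore(max_concurrency) if max_concurrency else nullcontext()

    async def limited(i: int) -> int:
        async with limiter:
            return await fake_request(i)

    return await asyncio.gather(*(limited(i) for i in range(n)))


print(asyncio.run(run_all(8, max_concurrency=2)))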
@@ -6,7 +6,7 @@ import math
 import os
 import time
 from types import TracebackType
-from typing import Any, Optional, Union
+from typing import Any
 
 
 def convert_to_pytorch_benchmark_format(
@@ -92,7 +92,7 @@ class TimeCollector:
     def __init__(self, scale: int) -> None:
         self.cnt: int = 0
         self._sum: int = 0
-        self._max: Optional[int] = None
+        self._max: int | None = None
         self.scale = scale
         self.start_time: int = time.monotonic_ns()
 
@@ -104,13 +104,13 @@ class TimeCollector:
         else:
             self._max = max(self._max, v)
 
-    def avg(self) -> Union[float, str]:
+    def avg(self) -> float | str:
         return self._sum * 1.0 / self.cnt / self.scale if self.cnt > 0 else "N/A"
 
-    def max(self) -> Union[float, str]:
+    def max(self) -> float | str:
         return self._max / self.scale if self._max else "N/A"
 
-    def dump_avg_max(self) -> list[Union[float, str]]:
+    def dump_avg_max(self) -> list[float | str]:
         return [self.avg(), self.max()]
 
     def __enter__(self) -> None:
@@ -118,8 +118,8 @@ class TimeCollector:
 
     def __exit__(
         self,
-        exc_type: Optional[type[BaseException]],
-        exc_value: Optional[BaseException],
-        exc_traceback: Optional[TracebackType],
+        exc_type: type[BaseException] | None,
+        exc_value: BaseException | None,
+        exc_traceback: TracebackType | None,
     ) -> None:
         self.collect(time.monotonic_ns() - self.start_time)
@@ -6,8 +6,7 @@ import copy
 import itertools
 import pickle as pkl
 import time
-from collections.abc import Iterable
-from typing import Callable
+from collections.abc import Callable, Iterable
 
 import torch
 import torch.utils.benchmark as TBenchmark
@@ -16,7 +15,7 @@ from utils import make_rand_sparse_tensors
 from weight_shapes import WEIGHT_SHAPES
 
 from vllm import _custom_ops as ops
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
 
 DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())
 DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512]
@@ -6,8 +6,7 @@ import copy
 import itertools
 import pickle as pkl
 import time
-from collections.abc import Iterable
-from typing import Callable, Optional
+from collections.abc import Callable, Iterable
 
 import torch
 import torch.utils.benchmark as TBenchmark
@@ -19,7 +18,8 @@ from vllm import _custom_ops as ops
 from vllm.model_executor.layers.quantization.utils.fp8_utils import (
     w8a8_triton_block_scaled_mm,
 )
-from vllm.utils import FlexibleArgumentParser, cdiv
+from vllm.utils.argparse_utils import FlexibleArgumentParser
+from vllm.utils.math_utils import cdiv
 
 DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())
 DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512]
@@ -53,7 +53,7 @@ def bench_int8(
     n: int,
     label: str,
     sub_label: str,
-    bench_kernels: Optional[list[str]] = None,
+    bench_kernels: list[str] | None = None,
 ) -> Iterable[TMeasurement]:
     """Benchmark INT8-based kernels."""
     assert dtype == torch.int8
@@ -108,7 +108,7 @@ def bench_fp8(
     n: int,
     label: str,
     sub_label: str,
-    bench_kernels: Optional[list[str]] = None,
+    bench_kernels: list[str] | None = None,
 ) -> Iterable[TMeasurement]:
     """Benchmark FP8-based kernels."""
     assert dtype == torch.float8_e4m3fn
@@ -183,7 +183,7 @@ def bench(
     n: int,
     label: str,
     sub_label: str,
-    bench_kernels: Optional[list[str]] = None,
+    bench_kernels: list[str] | None = None,
 ) -> Iterable[TMeasurement]:
     if dtype == torch.int8:
         return bench_int8(dtype, m, k, n, label, sub_label, bench_kernels)
@@ -201,7 +201,7 @@ def print_timers(timers: Iterable[TMeasurement]):
 def run(
     dtype: torch.dtype,
     MKNs: Iterable[tuple[int, int, int]],
-    bench_kernels: Optional[list[str]] = None,
+    bench_kernels: list[str] | None = None,
 ) -> Iterable[TMeasurement]:
     results = []
     for m, k, n in MKNs:
@@ -3,10 +3,9 @@
 
 import pickle as pkl
 import time
-from collections.abc import Iterable
+from collections.abc import Callable, Iterable
 from dataclasses import dataclass
 from itertools import product
-from typing import Callable, Optional
 
 import torch
 import torch.utils.benchmark as TBenchmark
@@ -51,7 +50,7 @@ def get_bench_params() -> list[bench_params_t]:
 def unfused_int8_impl(
     rms_norm_layer: RMSNorm,
     x: torch.Tensor,
-    residual: Optional[torch.Tensor],
+    residual: torch.Tensor | None,
     quant_dtype: torch.dtype,
 ):
     # Norm
@@ -68,7 +67,7 @@ def unfused_int8_impl(
 def unfused_fp8_impl(
     rms_norm_layer: RMSNorm,
     x: torch.Tensor,
-    residual: Optional[torch.Tensor],
+    residual: torch.Tensor | None,
     quant_dtype: torch.dtype,
 ):
     # Norm
@@ -85,7 +84,7 @@ def unfused_fp8_impl(
 def fused_impl(
     rms_norm_layer: RMSNorm,  # this stores the weights
     x: torch.Tensor,
-    residual: Optional[torch.Tensor],
+    residual: torch.Tensor | None,
     quant_dtype: torch.dtype,
 ):
     out, _ = ops.rms_norm_dynamic_per_token_quant(
benchmarks/kernels/bench_mxfp4_qutlass.py (new file, 191 lines)
@@ -0,0 +1,191 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+#
+# Copyright (C) 2025 Roberto L. Castro (Roberto.LopezCastro@ist.ac.at).
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import argparse
+import copy
+import itertools
+
+import torch
+from compressed_tensors.transform.utils.hadamard import deterministic_hadamard_matrix
+from weight_shapes import WEIGHT_SHAPES
+
+from vllm._custom_ops import fusedQuantizeMx, matmul_mxf4_bf16_tn
+from vllm.model_executor.layers.quantization.qutlass_utils import to_blocked
+from vllm.triton_utils import triton
+
+PROVIDER_CFGS = {
+    "torch-bf16": dict(enabled=True),
+    "mxfp4": dict(no_a_quant=False, enabled=True),
+    "mxfp4-noquant": dict(no_a_quant=True, enabled=True),
+}
+
+_enabled = [k for k, v in PROVIDER_CFGS.items() if v["enabled"]]
+
+
+def get_hadamard_matrix(group_size: int, dtype: torch.dtype, device: torch.device):
+    return (
+        deterministic_hadamard_matrix(group_size, dtype=dtype, device=device)
+        * group_size**-0.5
+    )
+
+
+def _quant_weight_mxfp4(
+    b: torch.Tensor, forward_hadamard_matrix: torch.Tensor, device: str
+):
+    weight_hf_e2m1, weight_hf_e8m0 = fusedQuantizeMx(
+        b, forward_hadamard_matrix, method="abs_max"
+    )
+    weight_hf_scale_block = to_blocked(weight_hf_e8m0, backend="triton")
+    return weight_hf_e2m1, weight_hf_scale_block
+
+
+def build_mxfp4_runner(cfg, a, b, forward_hadamard_matrix, dtype, device):
+    weight_hf_e2m1, weight_hf_scale_block = _quant_weight_mxfp4(
+        b, forward_hadamard_matrix, device
+    )
+    alpha = torch.tensor([1.0], device="cuda")
+
+    if cfg["no_a_quant"]:
+        # Pre-quantize activation
+        input_hf_e2m1, input_hf_e8m0 = fusedQuantizeMx(
+            a, forward_hadamard_matrix, method="abs_max"
+        )
+        input_hf_scale_block = to_blocked(input_hf_e8m0, backend="triton")
+
+        def run():
+            return matmul_mxf4_bf16_tn(
+                input_hf_e2m1,
+                weight_hf_e2m1,
+                input_hf_scale_block,
+                weight_hf_scale_block,
+                alpha,
+            )
+
+        return run
+
+    # Quantize activation on-the-fly
+    def run():
+        input_hf_e2m1, input_hf_e8m0 = fusedQuantizeMx(
+            a, forward_hadamard_matrix, method="abs_max"
+        )
+        input_hf_scale_block = to_blocked(input_hf_e8m0, backend="triton")
+        return matmul_mxf4_bf16_tn(
+            input_hf_e2m1,
+            weight_hf_e2m1,
+            input_hf_scale_block,
+            weight_hf_scale_block,
+            alpha,
+        )
+
+    return run
+
+
+@triton.testing.perf_report(
+    triton.testing.Benchmark(
+        x_names=["batch_size"],
+        x_vals=[
+            1,
+            4,
+            8,
+            16,
+            32,
+            64,
+            128,
+            256,
+            512,
+            1024,
+            2048,
+            4096,
+            8192,
+            16384,
+            24576,
+            32768,
+        ],
+        x_log=False,
+        line_arg="provider",
+        line_vals=_enabled,
+        line_names=_enabled,
+        ylabel="TFLOP/s (larger is better)",
+        plot_name="BF16 vs MXFP4 GEMMs",
+        args={},
+    )
+)
+def benchmark(batch_size, provider, N, K, had_size):
+    M = batch_size
+    device = "cuda"
+    dtype = torch.bfloat16
+
+    a = torch.randn((M, K), device=device, dtype=dtype)
+    b = torch.randn((N, K), device=device, dtype=dtype)
+    forward_hadamard_matrix = get_hadamard_matrix(had_size, dtype, device)
+
+    quantiles = [0.5, 0.2, 0.8]
+
+    if provider == "torch-bf16":
+        ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
+            lambda: torch.nn.functional.linear(a, b), rep=200, quantiles=quantiles
+        )
+    else:
+        cfg = PROVIDER_CFGS[provider]
+        run_quant = build_mxfp4_runner(
+            cfg, a, b, forward_hadamard_matrix, dtype, device
+        )
+        ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
+            lambda: run_quant(), rep=200, quantiles=quantiles
+        )
+
+    to_tflops = lambda t_ms: (2 * M * N * K) * 1e-12 / (t_ms * 1e-3)
+    return to_tflops(ms), to_tflops(max_ms), to_tflops(min_ms)
+
+
+def prepare_shapes(args):
+    out = []
+    for model, tp_size in itertools.product(args.models, args.tp_sizes):
+        for KN, tp_dim in copy.deepcopy(WEIGHT_SHAPES[model]):
+            KN[tp_dim] //= tp_size
+            KN.append(model)
+            out.append(KN)
+    return out
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--models",
+        nargs="+",
+        type=str,
+        default=["meta-llama/Llama-3.3-70B-Instruct"],
+        choices=list(WEIGHT_SHAPES.keys()),
+    )
+    parser.add_argument("--tp-sizes", nargs="+", type=int, default=[1])
+    args = parser.parse_args()
+
+    for K, N, model in prepare_shapes(args):
+        for had_size in [32, 64, 128]:
+            print(f"{model}, N={N} K={K}, HAD={had_size}, BF16 vs MXFP4 GEMMs TFLOP/s:")
+            benchmark.run(
+                print_data=True,
+                show_plots=True,
+                save_path=f"bench_mxfp4_res_n{N}_k{K}",
+                N=N,
+                K=K,
+                had_size=had_size,
+            )
+
+    print("Benchmark finished!")
207
benchmarks/kernels/bench_nvfp4_qutlass.py
Normal file
207
benchmarks/kernels/bench_nvfp4_qutlass.py
Normal file
@ -0,0 +1,207 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
#
# Copyright (C) 2025 Roberto L. Castro (Roberto.LopezCastro@ist.ac.at).
# All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import argparse
import copy
import itertools

import torch
from compressed_tensors.transform.utils.hadamard import deterministic_hadamard_matrix
from weight_shapes import WEIGHT_SHAPES

from vllm import _custom_ops as ops  # use existing nvfp4 gemm in vllm
from vllm._custom_ops import fusedQuantizeNv
from vllm.model_executor.layers.quantization.qutlass_utils import to_blocked
from vllm.triton_utils import triton

PROVIDER_CFGS = {
    "torch-bf16": dict(enabled=True),
    "nvfp4": dict(no_a_quant=False, enabled=True),
    "nvfp4-noquant": dict(no_a_quant=True, enabled=True),
}

_enabled = [k for k, v in PROVIDER_CFGS.items() if v["enabled"]]


def get_hadamard_matrix(group_size: int, dtype: torch.dtype, device: torch.device):
    return (
        deterministic_hadamard_matrix(group_size, dtype=dtype, device=device)
        * group_size**-0.5
    )


def _quant_weight_nvfp4(
    b: torch.Tensor,
    forward_hadamard_matrix: torch.Tensor,
    global_scale: torch.Tensor,
    device: str,
    M: int,
    N: int,
    K: int,
):
    weight_hf_e2m1, weight_hf_e8m0 = fusedQuantizeNv(
        b, forward_hadamard_matrix, global_scale
    )
    weight_hf_scale_block = to_blocked(weight_hf_e8m0, backend="triton").view(
        -1, K // 16
    )
    return weight_hf_e2m1, weight_hf_scale_block


def build_nvfp4_runner(cfg, a, b, forward_hadamard_matrix, dtype, device, M, N, K):
    alpha = torch.tensor([1.0], device="cuda")
    global_scale = torch.tensor([1.0], device="cuda")
    weight_hf_e2m1, weight_hf_scale_block = _quant_weight_nvfp4(
        b, forward_hadamard_matrix, global_scale, device, M, N, K
    )

    if cfg["no_a_quant"]:
        # Pre-quantize activation
        input_hf_e2m1, input_hf_e8m0 = fusedQuantizeNv(
            a, forward_hadamard_matrix, global_scale
        )
        input_hf_scale_block = to_blocked(input_hf_e8m0, backend="triton").view(
            -1, K // 16
        )

        def run():
            return ops.cutlass_scaled_fp4_mm(
                input_hf_e2m1,
                weight_hf_e2m1,
                input_hf_scale_block,
                weight_hf_scale_block,
                alpha,
                torch.bfloat16,
            )

        return run

    # Quantize activation on-the-fly
    def run():
        input_hf_e2m1, input_hf_e8m0 = fusedQuantizeNv(
            a, forward_hadamard_matrix, global_scale
        )
        input_hf_scale_block = to_blocked(input_hf_e8m0, backend="triton").view(
            -1, K // 16
        )
        return ops.cutlass_scaled_fp4_mm(
            input_hf_e2m1,
            weight_hf_e2m1,
            input_hf_scale_block,
            weight_hf_scale_block,
            alpha,
            torch.bfloat16,
        )

    return run


@triton.testing.perf_report(
    triton.testing.Benchmark(
        x_names=["batch_size"],
        x_vals=[
            1,
            4,
            8,
            16,
            32,
            64,
            128,
            256,
            512,
            1024,
            2048,
            4096,
            8192,
            16384,
            24576,
            32768,
        ],
        x_log=False,
        line_arg="provider",
        line_vals=_enabled,
        line_names=_enabled,
        ylabel="TFLOP/s (larger is better)",
        plot_name="BF16 vs NVFP4 GEMMs",
        args={},
    )
)
def benchmark(batch_size, provider, N, K, had_size):
    M = batch_size
    device = "cuda"
    dtype = torch.bfloat16

    a = torch.randn((M, K), device=device, dtype=dtype)
    b = torch.randn((N, K), device=device, dtype=dtype)
    forward_hadamard_matrix = get_hadamard_matrix(had_size, dtype, device)

    quantiles = [0.5, 0.2, 0.8]

    if provider == "torch-bf16":
        ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
            lambda: torch.nn.functional.linear(a, b), rep=200, quantiles=quantiles
        )
    else:
        cfg = PROVIDER_CFGS[provider]
        run_quant = build_nvfp4_runner(
            cfg, a, b, forward_hadamard_matrix, dtype, device, M, N, K
        )
        ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
            lambda: run_quant(), rep=200, quantiles=quantiles
        )

    to_tflops = lambda t_ms: (2 * M * N * K) * 1e-12 / (t_ms * 1e-3)
    return to_tflops(ms), to_tflops(max_ms), to_tflops(min_ms)


def prepare_shapes(args):
    out = []
    for model, tp_size in itertools.product(args.models, args.tp_sizes):
        for KN, tp_dim in copy.deepcopy(WEIGHT_SHAPES[model]):
            KN[tp_dim] //= tp_size
            KN.append(model)
            out.append(KN)
    return out


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--models",
        nargs="+",
        type=str,
        default=["meta-llama/Llama-3.3-70B-Instruct"],
        choices=list(WEIGHT_SHAPES.keys()),
    )
    parser.add_argument("--tp-sizes", nargs="+", type=int, default=[1])
    args = parser.parse_args()

    for K, N, model in prepare_shapes(args):
        for had_size in [16, 32, 64, 128]:
            print(f"{model}, N={N} K={K}, HAD={had_size}, BF16 vs NVFP4 GEMMs TFLOP/s:")
            benchmark.run(
                print_data=True,
                show_plots=True,
                save_path=f"bench_nvfp4_res_n{N}_k{K}",
                N=N,
                K=K,
                had_size=had_size,
            )

    print("Benchmark finished!")
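Both scripts report throughput with the same conversion: a GEMM performs 2*M*N*K floating-point operations, so dividing by the measured time gives TFLOP/s. A quick sanity check of that arithmetic, using invented numbers:

# Illustrative only: the dimensions and latency below are made up.
M, N, K = 1024, 8192, 8192
t_ms = 0.85  # hypothetical measured latency in milliseconds

flops = 2 * M * N * K                   # one multiply-add counts as 2 FLOPs
tflops = flops * 1e-12 / (t_ms * 1e-3)  # same formula as to_tflops above
print(f"{tflops:.1f} TFLOP/s")          # ~161.7 TFLOP/s for these numbers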
@ -1,7 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import itertools
-from typing import Callable
+from collections.abc import Callable
 from unittest.mock import patch

 import pandas as pd
@ -10,7 +10,8 @@ import torch
 from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8
 from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
 from vllm.triton_utils import triton
-from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
+from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE


 def with_triton_mode(fn):
@ -10,7 +10,8 @@ import vllm.model_executor.layers.activation  # noqa F401
 from vllm.model_executor.custom_op import CustomOp
 from vllm.platforms import current_platform
 from vllm.triton_utils import triton
-from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
+from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE

 batch_size_range = [1, 16, 32, 64, 128]
 seq_len_range = [1, 16, 64, 128, 256, 512, 1024, 2048, 4096]
@ -28,7 +28,7 @@ except ImportError as e:

 from bitblas import Matmul, MatmulConfig, auto_detect_nvidia_target

-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser

 parser = FlexibleArgumentParser(
     description="Benchmark BitBLAS int4 on a specific target."
@ -20,7 +20,7 @@ from vllm.model_executor.layers.fused_moe.config import (
 from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp4
 from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk
 from vllm.scalar_type import scalar_types
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser

 WEIGHT_SHAPES_MOE = {
     "nvidia/DeepSeek-R1-FP4": [
@ -14,7 +14,7 @@ from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_confi
 from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp8
 from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk
 from vllm.platforms import current_platform
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser

 # Weight shapes for different models: [num_experts, topk, hidden_size,
 # intermediate_size]
@ -22,8 +22,8 @@ Example:
 import json
 import os
 import time
+from collections.abc import Callable
 from contextlib import nullcontext
-from typing import Callable, Optional

 import torch
 import torch.distributed as dist
@ -39,7 +39,7 @@ from vllm.distributed.device_communicators.pynccl_allocator import (
 )
 from vllm.distributed.device_communicators.symm_mem import SymmMemCommunicator
 from vllm.logger import init_logger
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser

 logger = init_logger(__name__)

@ -264,12 +264,12 @@ class CommunicatorBenchmark:
     def benchmark_allreduce_single(
         self,
         sequence_length: int,
-        allreduce_fn: Callable[[torch.Tensor], Optional[torch.Tensor]],
+        allreduce_fn: Callable[[torch.Tensor], torch.Tensor | None],
         should_use_fn: Callable[[torch.Tensor], bool],
         context,
         num_warmup: int,
         num_trials: int,
-    ) -> Optional[float]:
+    ) -> float | None:
         """Benchmark method with CUDA graph optimization."""
         try:
             # Create test tensor (2D: sequence_length x hidden_size)
@ -13,11 +13,11 @@ from vllm.model_executor.layers.fused_moe.fused_moe import (
     fused_experts,
     fused_topk,
 )
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser

 DEFAULT_MODELS = [
-    "nm-testing/Mixtral-8x7B-Instruct-v0.1",
-    "nm-testing/deepseekv2-lite",
+    "mistralai/Mixtral-8x7B-Instruct-v0.1",
+    "deepseek-ai/DeepSeek-V2-Lite",
     "ibm-granite/granite-3.0-1b-a400m",
     "ibm-granite/granite-3.0-3b-a800m",
 ]
@ -7,7 +7,8 @@ import torch

 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.platforms import current_platform
-from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser
+from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE


 @torch.inference_mode()
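Most of the hunks above make the same two mechanical changes: FlexibleArgumentParser and STR_DTYPE_TO_TORCH_DTYPE now come from the split vllm.utils.argparse_utils and vllm.utils.torch_utils modules, and typing.Optional annotations are rewritten with the PEP 604 union syntax. As a brief illustration (not taken from the diff), the two annotation styles are equivalent:

# Equivalent annotations; X | None (PEP 604) requires Python 3.10+ at runtime.
from typing import Optional


def old_style(x: Optional[int]) -> Optional[float]:
    return float(x) if x is not None else None


def new_style(x: int | None) -> float | None:
    return float(x) if x is not None else None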
@ -6,11 +6,12 @@ import copy
 import json
 import pickle
 import time
+from collections.abc import Callable
 from dataclasses import dataclass
 from enum import Enum, auto
 from itertools import product
 from pathlib import Path
-from typing import Any, Callable, Optional
+from typing import Any

 import torch
 import torch.utils.benchmark as TBenchmark
@ -18,13 +19,24 @@ from torch.utils.benchmark import Measurement as TMeasurement
 from utils import ArgPool, Bench, CudaGraphBenchParams
 from weight_shapes import WEIGHT_SHAPES

-from vllm.triton_utils import HAS_TRITON
+from vllm.lora.ops.triton_ops.utils import get_lora_op_configs
+from vllm.triton_utils import HAS_TRITON, triton

 if HAS_TRITON:
-    from vllm.lora.ops.triton_ops import LoRAKernelMeta, lora_expand, lora_shrink
+    from vllm.lora.ops.triton_ops import (  ## added fused_moe_lora
+        LoRAKernelMeta,
+        fused_moe_lora_expand,
+        fused_moe_lora_shrink,
+        lora_expand,
+        lora_shrink,
+    )
+    from vllm.lora.ops.triton_ops.fused_moe_lora_op import (
+        _LORA_PTR_DICT,  ## added _LORA_PTR_DICT for fused_moe_lora
+    )
     from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT

-from vllm.utils import FlexibleArgumentParser
+from vllm import _custom_ops as ops
+from vllm.utils.argparse_utils import FlexibleArgumentParser
+from vllm.utils.math_utils import round_up

 DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())
 DEFAULT_TP_SIZES = [1]
@ -58,6 +70,8 @@ DEFAULT_NUM_LORAS = [1, 2, 3, 4]
 DEFAULT_SORT_BY_LORA_IDS = [False, True]
 DEFAULT_SEQ_LENGTHS = [1]
 DEFAULT_EXPAND_FN_ADD_INPUTS = [True, False]
+DEFAULT_TOP_K_NUMS = [1]  # Added for MoE LoRA top_k
+DEFAULT_NUM_EXPERTS = [8]  # Added for MoE LoRA num_experts


 # Utilities
@ -158,7 +172,7 @@ def ref_group_gemm(
     seq_lens_cpu: torch.Tensor,
     prompt_lora_mapping_cpu: torch.Tensor,
     scaling: float,
-    add_inputs: Optional[bool],
+    add_inputs: bool | None,
 ):
     """
     Torch group gemm reference implementation to test correctness of
@ -190,6 +204,11 @@ class OpType(Enum):

     LORA_SHRINK = auto()
     LORA_EXPAND = auto()
+    ## Adding support for fused moe lora
+    FUSED_MOE_LORA_GATE_UP_SHRINK = auto()  ## Gate/Up projection variant with shrink
+    FUSED_MOE_LORA_GATE_UP_EXPAND = auto()  ## Gate/Up projection variant with expand
+    FUSED_MOE_LORA_DOWN_SHRINK = auto()  ## Down projection variant with shrink
+    FUSED_MOE_LORA_DOWN_EXPAND = auto()  ## Down projection variant with expand

     @staticmethod
     def from_str(s: str) -> "OpType":
@ -197,6 +216,15 @@ class OpType(Enum):
             return OpType.LORA_SHRINK
         if s.lower() == "lora_expand":
             return OpType.LORA_EXPAND
+        # Adding support for fused moe lora, both in gate_up and down
+        if s.lower() == "fused_moe_lora_gate_up_shrink":  ## Gate/Up variant with shrink
+            return OpType.FUSED_MOE_LORA_GATE_UP_SHRINK
+        if s.lower() == "fused_moe_lora_gate_up_expand":  ## Gate/Up variant with expand
+            return OpType.FUSED_MOE_LORA_GATE_UP_EXPAND
+        if s.lower() == "fused_moe_lora_down_shrink":  ## Down variant with shrink
+            return OpType.FUSED_MOE_LORA_DOWN_SHRINK
+        if s.lower() == "fused_moe_lora_down_expand":  ## Down variant with expand
+            return OpType.FUSED_MOE_LORA_DOWN_EXPAND
         raise ValueError(f"Unrecognized str {s} to convert to OpType")

     def is_shrink_fn(self) -> bool:
@ -205,19 +233,56 @@ class OpType(Enum):
     def is_expand_fn(self) -> bool:
         return self in [OpType.LORA_EXPAND]

+    def is_fused_moe_lora_fn(self) -> bool:  ## adding for fused MoE LoRA
+        return self in [
+            OpType.FUSED_MOE_LORA_GATE_UP_SHRINK,
+            OpType.FUSED_MOE_LORA_DOWN_SHRINK,
+            OpType.FUSED_MOE_LORA_GATE_UP_EXPAND,
+            OpType.FUSED_MOE_LORA_DOWN_EXPAND,
+        ]
+
+    def is_fused_moe_lora_gate_up_fn(
+        self,
+    ) -> bool:  ## adding for fused MoE LoRA Gate/Up
+        return self in [
+            OpType.FUSED_MOE_LORA_GATE_UP_SHRINK,
+            OpType.FUSED_MOE_LORA_GATE_UP_EXPAND,
+        ]
+
+    def is_fused_moe_lora_down_fn(self) -> bool:  ## adding for fused MoE LoRA Down
+        return self in [
+            OpType.FUSED_MOE_LORA_DOWN_SHRINK,
+            OpType.FUSED_MOE_LORA_DOWN_EXPAND,
+        ]
+
+    def is_fused_moe_lora_shrink_fn(self) -> bool:
+        return self in [
+            OpType.FUSED_MOE_LORA_GATE_UP_SHRINK,
+            OpType.FUSED_MOE_LORA_DOWN_SHRINK,
+        ]
+
+    def is_fused_moe_lora_expand_fn(self) -> bool:
+        return self in [
+            OpType.FUSED_MOE_LORA_GATE_UP_EXPAND,
+            OpType.FUSED_MOE_LORA_DOWN_EXPAND,
+        ]
+
     def num_slices(self) -> list[int]:
+        if self.is_fused_moe_lora_gate_up_fn():
+            return [2]
+        elif self.is_fused_moe_lora_down_fn():
+            return [1]
         return [1, 2, 3]

     def mkn(
         self, batch_size: int, seq_length: int, hidden_size: int, lora_rank: int
     ) -> tuple[int, int, int]:
         num_tokens = batch_size * seq_length
-        if self.is_shrink_fn():
+        if self.is_shrink_fn() or self.is_fused_moe_lora_fn():
             m = num_tokens
             k = hidden_size
             n = lora_rank
-        else:
-            assert self.is_expand_fn()
+        elif self.is_expand_fn():
             m = num_tokens
             k = lora_rank
             n = hidden_size
@ -231,9 +296,36 @@ class OpType(Enum):
         """
         if self.is_shrink_fn():
             return op_dtype, op_dtype, torch.float32
-        else:
-            assert self.is_expand_fn()
+        elif self.is_expand_fn():
             return torch.float32, op_dtype, op_dtype
+        else:
+            assert self.is_fused_moe_lora_fn()
+            return op_dtype, op_dtype, op_dtype
+
+    def matmul_shapes_fused_moe_lora(
+        self,
+        m: int,
+        n: int,
+        k: int,
+        num_loras: int,
+        num_slices: int,
+        top_k_num: int,
+        num_experts: int,
+    ) -> tuple[tuple[int], tuple[int], tuple[int], tuple[int]]:
+        if self.is_fused_moe_lora_shrink_fn():
+            input_shape = (
+                (m * top_k_num, n)
+                if self in [OpType.FUSED_MOE_LORA_DOWN_SHRINK]
+                else (m, n)
+            )
+            output_shape = (num_slices, m, top_k_num, k)
+            weight_shape = (num_loras, num_experts, k, n)
+        else:
+            assert self.is_fused_moe_lora_expand_fn()
+            input_shape = (num_slices, m, top_k_num, k)
+            output_shape = (m, top_k_num, n * num_slices)
+            weight_shape = (num_loras, num_experts, n, k)
+        return (input_shape, weight_shape, output_shape)

     def matmul_shapes(
         self,
@ -243,6 +335,8 @@ class OpType(Enum):
         lora_rank: int,
         num_loras: int,
         num_slices: int,
+        top_k_num: int | None = None,
+        num_experts: int | None = None,
     ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
         """
         Given num_slices, return the shapes of the A, B, and C matrices
@ -257,6 +351,16 @@ class OpType(Enum):
         if self in [OpType.LORA_EXPAND]:
             # LoRA expand kernels support num_slices inherently in the kernel
             return ((num_slices, m, k), b_shape, (m, n * num_slices))
+        if self.is_fused_moe_lora_fn():
+            return self.matmul_shapes_fused_moe_lora(
+                m,
+                k,
+                n,
+                num_loras,
+                num_slices,
+                top_k_num,
+                num_experts,
+            )
         raise ValueError(f"Unrecognized op_type {self}")

     def bench_fn(self) -> Callable:
@ -264,6 +368,16 @@ class OpType(Enum):
             return lora_shrink
         if self == OpType.LORA_EXPAND:
             return lora_expand
+        if self in [
+            OpType.FUSED_MOE_LORA_GATE_UP_SHRINK,
+            OpType.FUSED_MOE_LORA_DOWN_SHRINK,
+        ]:
+            return fused_moe_lora_shrink
+        if self in [
+            OpType.FUSED_MOE_LORA_GATE_UP_EXPAND,
+            OpType.FUSED_MOE_LORA_DOWN_EXPAND,
+        ]:
+            return fused_moe_lora_expand

         raise ValueError(f"Unrecognized optype {self}")

@ -316,8 +430,10 @@ class BenchmarkContext:
     lora_rank: int
     sort_by_lora_id: bool
     dtype: torch.dtype
-    seq_length: Optional[int] = None
-    num_slices: Optional[int] = None  # num_slices for slice based ops
+    seq_length: int | None = None
+    num_experts: int | None = None  # num_experts for MoE based ops
+    top_k_num: int | None = None  # top_k for MoE based ops
+    num_slices: int | None = None  # num_slices for slice based ops

     def with_seq_length(self, seq_length: int) -> "BenchmarkContext":
         ctx = copy.copy(self)
@ -372,6 +488,11 @@ class BenchmarkTensors:
             f"{dtype_to_str(self.output.dtype)}"
         )

+    def get_num_tokens(self, size: int, top_k_num: int, op_type: OpType):
+        return (
+            size * top_k_num if op_type in [OpType.FUSED_MOE_LORA_DOWN_SHRINK] else size
+        )
+
     @staticmethod
     def make(
         ctx: BenchmarkContext, op_type: OpType, device: str = "cuda"
@ -384,6 +505,8 @@ class BenchmarkTensors:
             ctx.lora_rank,
             ctx.num_loras,
             ctx.num_slices,
+            ctx.top_k_num,
+            ctx.num_experts,
         )
         a_type, b_type, c_type = op_type.matmul_dtypes(ctx.dtype)
         input_tensor, lora_weights, output_tensor = make_rand_tensors(
@ -431,17 +554,27 @@ class BenchmarkTensors:
             prompt_lora_indices_tensor,
         )

-    def sanity_check(self) -> None:
+    def sanity_check(self, ctx: BenchmarkContext, op_type: OpType) -> None:
         """
         Fails asserts when non-conformality is detected.
         """
-        num_tokens = self.input.shape[-2]
+        num_tokens = (
+            self.input.shape[1]
+            if op_type.is_fused_moe_lora_expand_fn()
+            else self.input.shape[-2]
+        )
         # check metadata tensors
-        assert torch.sum(self.seq_lens) == num_tokens
+        ## In down shrink case, each token is repeated top_k_num times
+        assert num_tokens == self.get_num_tokens(
+            torch.sum(self.seq_lens), ctx.top_k_num, op_type
+        ), f"Expected {num_tokens} tokens, but got {torch.sum(self.seq_lens)}"
         num_seqs = self.seq_lens.shape[0]
         # assert self.seq_start_loc.shape[0] == num_seqs
+        ## In down shrink case, each prompt corresponds to top_k_num sequences
         assert self.prompt_lora_mapping.shape[0] == num_seqs
-        assert self.lora_kernel_meta.token_lora_mapping.shape[0] == num_tokens
+        assert self.get_num_tokens(
+            self.lora_kernel_meta.token_lora_mapping.shape[0], ctx.top_k_num, op_type
+        )

     def to_device(self, device: str):
         """
@ -470,21 +603,111 @@ class BenchmarkTensors:
             to_device(field) if field_name != "no_lora_flag_cpu" else field,
         )

-    def metadata(self) -> tuple[int, int, int]:
+    def metadata(self, ctx: BenchmarkContext, op_type: OpType) -> tuple[int, int, int]:
         """
         Return num_seqs, num_tokens and max_seq_len
         """
         num_seqs = self.seq_lens.shape[0]
-        num_tokens = self.lora_kernel_meta.token_lora_mapping.shape[0]
+        num_tokens = self.get_num_tokens(
+            self.lora_kernel_meta.token_lora_mapping.shape[0], ctx.top_k_num, op_type
+        )
         max_seq_len = torch.max(self.seq_lens).item()
         num_slices = len(self.lora_weights_lst)
         return num_seqs, num_tokens, max_seq_len, num_slices

-    def as_lora_shrink_kwargs(self) -> dict[str, Any]:
-        self.sanity_check()
+    def fused_moe_lora_data_prepare(
+        self,
+        block_size: int,
+        token_lora_mapping: torch.Tensor,
+        ctx: BenchmarkContext,
+    ):
+        def moe_lora_align_block_size(
+            topk_ids: torch.Tensor,
+            token_lora_mapping: torch.Tensor,
+            block_size: int,
+            num_experts: int,
+            max_loras: int,
+            expert_map: torch.Tensor | None = None,
+            pad_sorted_ids: bool = False,
+        ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+            """
+            Aligns tokens and experts into block-sized chunks for LoRA-based
+            mixture-of-experts (MoE) execution.
+            """
+            max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1)
+            if pad_sorted_ids:
+                max_num_tokens_padded = round_up(max_num_tokens_padded, block_size)
+            sorted_ids = torch.empty(
+                (max_loras * max_num_tokens_padded,),
+                dtype=torch.int32,
+                device=topk_ids.device,
+            )
+            max_num_m_blocks = triton.cdiv(max_num_tokens_padded, block_size)
+            # Expert ids must be set default to -1 to prevent a blank block
+            expert_ids = torch.empty(
+                (max_loras * max_num_m_blocks,),
+                dtype=torch.int32,
+                device=topk_ids.device,
+            )
+            num_tokens_post_pad = torch.empty(
+                (max_loras), dtype=torch.int32, device=topk_ids.device
+            )
+
+            ops.moe_lora_align_block_size(
+                topk_ids,
+                token_lora_mapping,
+                num_experts,
+                block_size,
+                max_loras,
+                max_num_tokens_padded,
+                max_num_m_blocks,
+                sorted_ids,
+                expert_ids,
+                num_tokens_post_pad,
+            )
+            if expert_map is not None:
+                expert_ids = expert_map[expert_ids]
+
+            return sorted_ids, expert_ids, num_tokens_post_pad
+
+        num_tokens = ctx.batch_size
+        curr_topk_ids = torch.randint(
+            0,
+            ctx.num_experts,
+            (num_tokens, ctx.top_k_num),
+            device="cuda",
+            dtype=torch.int32,
+        )
+        topk_weights = torch.randint(
+            0,
+            ctx.num_experts,
+            (num_tokens, ctx.top_k_num),
+            device="cuda",
+            dtype=torch.int32,
+        )
+
+        (sorted_token_ids_lora, expert_ids_lora, num_tokens_post_padded_lora) = (
+            moe_lora_align_block_size(
+                topk_ids=curr_topk_ids,
+                token_lora_mapping=token_lora_mapping,
+                block_size=block_size,
+                num_experts=ctx.num_experts,
+                max_loras=ctx.num_loras,
+            )
+        )
+
+        sorted_token_ids = sorted_token_ids_lora.view(ctx.num_loras, -1)
+        expert_ids = expert_ids_lora.view(ctx.num_loras, -1)
+        num_tokens_post_padded = num_tokens_post_padded_lora
+        return (topk_weights, sorted_token_ids, expert_ids, num_tokens_post_padded)
+
+    def as_lora_shrink_kwargs(
+        self, ctx: BenchmarkContext, op_type: OpType
+    ) -> dict[str, Any]:
+        self.sanity_check(ctx, op_type)
         self.to_device(self.input.device)

-        _, num_tokens, _, num_slices = self.metadata()
+        _, num_tokens, _, num_slices = self.metadata(ctx, op_type)

         # Sanity check matrix shapes.
         i_shape, lw_shape, o_shape = (
@ -519,11 +742,13 @@ class BenchmarkTensors:
             "no_lora_flag_cpu": self.lora_kernel_meta.no_lora_flag_cpu,
         }

-    def as_lora_expand_kwargs(self, add_inputs: bool) -> dict[str, Any]:
-        self.sanity_check()
+    def as_lora_expand_kwargs(
+        self, ctx: BenchmarkContext, op_type: OpType, add_inputs: bool
+    ) -> dict[str, Any]:
+        self.sanity_check(ctx, op_type)
         self.to_device(self.input.device)

-        _, num_tokens, _, num_slices = self.metadata()
+        _, num_tokens, _, num_slices = self.metadata(ctx, op_type)

         # Sanity check matrix shapes.
         i_shape, lw_shape, o_shape = (
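The fused_moe_lora_data_prepare helper above pads each LoRA adapter's routed tokens so they can be processed in fixed-size blocks; the worst-case padded length is topk_ids.numel() + num_experts * (block_size - 1), since every expert may leave an almost-empty trailing block. A small worked example of that sizing, with invented values:

# Illustrative sizing only; the values below are made up.
import math

num_tokens, top_k, num_experts, block_size = 256, 2, 8, 16

topk_numel = num_tokens * top_k                            # 512 routed (token, expert) pairs
max_padded = topk_numel + num_experts * (block_size - 1)   # 512 + 8 * 15 = 632
max_blocks = math.ceil(max_padded / block_size)            # 40 blocks per LoRA adapter
print(max_padded, max_blocks)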
@ -560,22 +785,177 @@ class BenchmarkTensors:
             "no_lora_flag_cpu": self.lora_kernel_meta.no_lora_flag_cpu,
         }

-    def bench_fn_kwargs(
-        self, op_type: OpType, add_inputs: Optional[bool] = None
-    ) -> dict[str, Any]:
-        if op_type.is_shrink_fn():
+    def as_fused_moe_lora_shrink_kwargs(
+        self, ctx: BenchmarkContext, op_type: OpType
+    ) -> dict[str, Any]:
+        self.sanity_check(ctx, op_type)
+        self.to_device(self.input.device)
+
+        _, num_tokens, _, num_slices = self.metadata(ctx, op_type)
+
+        # Sanity check matrix shapes.
+        i_shape, lw_shape, o_shape = (
+            self.input.shape,
+            self.lora_weights_lst[0].shape,
+            self.output.shape,
+        )
+        # Expected input shape : [num_tokens, hidden_size] for gate_up
+        # Expected input shape : [top_k_num * num_tokens, hidden_size] for down
+        assert len(i_shape) == 2
+        assert i_shape[0] == num_tokens
+        hidden_size = i_shape[1]
+        # Expected lora weight shape [max_lora, num_experts, lora_rank, hidden_size]
+        assert len(lw_shape) == 4
+        assert lw_shape[-1] == hidden_size
+        lora_rank = lw_shape[-2]
+        # Expected output shape : [num_slices, num_tokens, top_k_num, lora_rank]
+        assert len(o_shape) == 4
+        assert (
+            o_shape
+            == (num_slices, num_tokens // ctx.top_k_num, ctx.top_k_num, lora_rank)
+            if op_type in [OpType.FUSED_MOE_LORA_DOWN_SHRINK]
+            else o_shape == (num_slices, num_tokens, ctx.top_k_num, lora_rank)
+        )
+        kernel_config = get_lora_op_configs(
+            op_type.name.lower(),
+            max_loras=lw_shape[0],
+            batch=num_tokens,
+            hidden_size=hidden_size,
+            rank=lora_rank,
+            num_slices=num_slices,
+            add_inputs=False,
+        )
+
+        (topk_weights, sorted_token_ids, expert_ids, num_tokens_post_padded) = (
+            self.fused_moe_lora_data_prepare(
+                block_size=kernel_config["BLOCK_SIZE_M"],
+                token_lora_mapping=self.lora_kernel_meta.token_lora_mapping,
+                ctx=ctx,
+            )
+        )
+
+        return {
+            "qcurr_hidden_states": self.input,
+            "lora_a_stacked": self.lora_weights_lst,
+            "a_intermediate_cache1": self.output,
+            "topk_weights": topk_weights,
+            "sorted_token_ids": sorted_token_ids,
+            "expert_ids": expert_ids,
+            "num_tokens_post_padded": num_tokens_post_padded,
+            "top_k_num": ctx.top_k_num,
+            "device": self.input.device,
+            "N": lora_rank,
+            "M": topk_weights.shape[0],
+            "EM": sorted_token_ids.shape[1],
+            "K": self.input.shape[1],
+            "num_tokens": num_tokens,
+            "num_experts": ctx.num_experts,
+            "num_slices": num_slices,
+            "shrink_block_size_m": kernel_config["BLOCK_SIZE_M"],
+            "shrink_block_size_n": kernel_config["BLOCK_SIZE_N"],
+            "shrink_block_size_k": kernel_config["BLOCK_SIZE_K"],
+            "shrink_group_size_m": kernel_config["GROUP_SIZE_M"],
+            "shrink_num_warps": kernel_config["NUM_WARPS"],
+            "shrink_num_stages": kernel_config["NUM_STAGES"],
+            "shrink_split_k": kernel_config.get("SPLIT_K", 1),
+            "mul_routed_weight": op_type.is_fused_moe_lora_down_fn(),
+        }
+
+    def as_fused_moe_lora_expand_kwargs(
+        self, ctx: BenchmarkContext, op_type: OpType
+    ) -> dict[str, Any]:
+        self.sanity_check(ctx, op_type)
+        self.to_device(self.input.device)
+
+        _, num_tokens, _, num_slices = self.metadata(ctx, op_type)
+
+        # Sanity check matrix shapes.
+        i_shape, lw_shape, o_shape = (
+            self.input.shape,
+            self.lora_weights_lst[0].shape,
+            self.output.shape,
+        )
+
+        # Expected input shape : [num_slices, num_tokens, top_k_num, lora_rank]
+        assert len(i_shape) == 4
+        assert i_shape[0] == num_slices
+        assert i_shape[1] == num_tokens
+        lora_rank = i_shape[-1]
+        # Expected lora weight shape : [num_loras, num_experts, hidden_size, lora_rank]
+        assert len(lw_shape) == 4
+        assert lw_shape[-1] == lora_rank
+        hidden_size = lw_shape[-2]
+        # Expected output shape : [num_tokens, top_k_num, hidden_size * num_slices]
+        assert len(o_shape) == 3
+        assert o_shape == (num_tokens, ctx.top_k_num, hidden_size * num_slices)
+
+        kernel_config = get_lora_op_configs(
+            op_type.name.lower(),
+            max_loras=lw_shape[0],
+            batch=num_tokens,
+            hidden_size=hidden_size,
+            rank=lora_rank,
+            num_slices=num_slices,
+            add_inputs=False,
+        )
+
+        (topk_weights, sorted_token_ids, expert_ids, num_tokens_post_padded) = (
+            self.fused_moe_lora_data_prepare(
+                block_size=kernel_config["BLOCK_SIZE_M"],
+                token_lora_mapping=self.lora_kernel_meta.token_lora_mapping,
+                ctx=ctx,
+            )
+        )
+
+        return {
+            "a_intermediate_cache1": self.input,
+            "lora_b_stacked": self.lora_weights_lst,
+            "output": self.output,
+            "topk_weights": topk_weights,
+            "sorted_token_ids": sorted_token_ids,
+            "expert_ids": expert_ids,
+            "num_tokens_post_padded": num_tokens_post_padded,
+            "top_k_num": ctx.top_k_num,
+            "device": self.input.device,
+            "N": lora_rank,
+            "M": topk_weights.shape[0],
+            "EM": sorted_token_ids.shape[1],
+            "K": self.input.shape[1],
+            "num_tokens": num_tokens,
+            "num_experts": ctx.num_experts,
+            "num_slices": num_slices,
+            "max_lora_rank": lora_rank,
+            "w1_output_dim_size": lw_shape[2],
+            "expand_block_size_m": kernel_config["BLOCK_SIZE_M"],
+            "expand_block_size_n": kernel_config["BLOCK_SIZE_N"],
+            "expand_block_size_k": kernel_config["BLOCK_SIZE_K"],
+            "expand_group_size_m": kernel_config["GROUP_SIZE_M"],
+            "expand_num_warps": kernel_config["NUM_WARPS"],
+            "expand_num_stages": kernel_config["NUM_STAGES"],
+            "expand_split_k": kernel_config.get("SPLIT_K", 1),
+            "mul_routed_weight": op_type.is_fused_moe_lora_down_fn(),
+        }
+
+    def bench_fn_kwargs(
+        self, ctx: BenchmarkContext, op_type: OpType, add_inputs: bool | None = None
+    ) -> dict[str, Any]:
+        if op_type.is_shrink_fn() or op_type.is_fused_moe_lora_fn():
             assert add_inputs is None
         else:
             assert add_inputs is not None

         if op_type == OpType.LORA_SHRINK:
-            return self.as_lora_shrink_kwargs()
+            return self.as_lora_shrink_kwargs(ctx, op_type)
         if op_type == OpType.LORA_EXPAND:
-            return self.as_lora_expand_kwargs(add_inputs)
+            return self.as_lora_expand_kwargs(ctx, op_type, add_inputs)
+        if op_type.is_fused_moe_lora_shrink_fn():
+            return self.as_fused_moe_lora_shrink_kwargs(ctx, op_type)
+        if op_type.is_fused_moe_lora_expand_fn():
+            return self.as_fused_moe_lora_expand_kwargs(ctx, op_type)
         raise ValueError(f"Unrecognized optype {self}")

     def test_correctness(
-        self, op_type: OpType, expand_fn_add_inputs: Optional[bool]
+        self, op_type: OpType, expand_fn_add_inputs: bool | None
     ) -> bool:
         """
         Test correctness of op_type implementation against a grouped gemm
@ -611,12 +991,12 @@ def bench_optype(
     ctx: BenchmarkContext,
     arg_pool_size: int,
     op_type: OpType,
-    cuda_graph_nops: Optional[int] = None,
-    expand_fn_add_inputs: Optional[bool] = None,
+    cuda_graph_nops: int | None = None,
+    expand_fn_add_inputs: bool | None = None,
     test_correctness: bool = False,
 ) -> TMeasurement:
     assert arg_pool_size >= 1
-    if op_type.is_shrink_fn():
+    if op_type.is_shrink_fn() or op_type.is_fused_moe_lora_fn():
         assert expand_fn_add_inputs is None
     else:
         assert expand_fn_add_inputs is not None
@ -626,23 +1006,30 @@ def bench_optype(
         BenchmarkTensors.make(ctx, op_type) for _ in range(arg_pool_size)
     ]
     for bt in bench_tensors:
-        bt.sanity_check()
+        bt.sanity_check(ctx, op_type)

     # Test correctness of our implementation.
     if test_correctness:
+        assert op_type in [OpType.LORA_SHRINK, OpType.LORA_EXPAND], (
+            f"Correctness testing is not supported for {op_type.name}."
+        )
         assert all(
-            [bt.test_correctness(op_type, expand_fn_add_inputs) for bt in bench_tensors]
+            [
+                bt.test_correctness(ctx, op_type, expand_fn_add_inputs)
+                for bt in bench_tensors
+            ]
         )

     # BenchmarkTensors -> dict (kwargs)
     kwargs_list = [
-        bt.bench_fn_kwargs(op_type, add_inputs=expand_fn_add_inputs)
+        bt.bench_fn_kwargs(ctx, op_type, add_inputs=expand_fn_add_inputs)
         for bt in bench_tensors
     ]

     # Clear LoRA optimization hash-maps.
     _LORA_A_PTR_DICT.clear()
     _LORA_B_PTR_DICT.clear()
+    _LORA_PTR_DICT.clear()
     # Run bench function so that _LORA_A_PTR_DICT and _LORA_B_PTR_DICT are set up
     for kwargs in kwargs_list:
         op_type.bench_fn()(**kwargs)
@ -679,7 +1066,7 @@ def bench_torch_mm(
     ctx: BenchmarkContext,
     arg_pool_size: int,
     op_type: OpType,
-    cuda_graph_nops: Optional[int] = None,
+    cuda_graph_nops: int | None = None,
 ) -> TMeasurement:
     """
     Benchmark basic torch.mm as a roofline.
@ -744,7 +1131,7 @@ def use_cuda_graph_recommendation() -> str:
     """


-def print_timers(timers: list[TMeasurement], args: Optional[argparse.Namespace] = None):
+def print_timers(timers: list[TMeasurement], args: argparse.Namespace | None = None):
     compare = TBenchmark.Compare(timers)
     compare.print()

@ -792,7 +1179,9 @@ def run(args: argparse.Namespace, bench_ctxs: list[BenchmarkContext]):

         # Benchmark bench_op
         expand_fn_add_inputs = (
-            [None] if bench_op.is_shrink_fn() else args.expand_fn_add_inputs
+            [None]
+            if bench_op.is_shrink_fn() or bench_op.is_fused_moe_lora_fn()
+            else args.expand_fn_add_inputs
         )
         for add_input_arg in expand_fn_add_inputs:
             seq_len_timers.append(
@ -830,12 +1219,22 @@ def as_benchmark_contexts(
     hidden_sizes: list[int], lora_ranks: list[int], args: argparse.Namespace
 ) -> list[BenchmarkContext]:
     ctxs: list[BenchmarkContext] = []
-    for batch_size, hidden_size, lora_rank, num_loras, sort_by_lora_id in product(  # noqa
+    for (
+        batch_size,
+        hidden_size,
+        lora_rank,
+        num_loras,
+        sort_by_lora_id,
+        top_k_num,
+        num_experts,
+    ) in product(  # noqa
         args.batch_sizes,
         list(hidden_sizes),
         lora_ranks,
         args.num_loras,
         args.sort_by_lora_id,
+        args.top_k_nums,
+        args.num_experts,
     ):
         ctxs.append(
             BenchmarkContext(
@ -850,6 +1249,8 @@ def as_benchmark_contexts(
                 seq_length=None,
                 sort_by_lora_id=sort_by_lora_id,
                 dtype=args.dtype,
+                top_k_num=top_k_num,
+                num_experts=num_experts,
                 # To be filled based on the OpType to benchmark
                 num_slices=None,
             )
@ -1011,6 +1412,22 @@ if __name__ == "__main__":
         ),
     )

+    p.add_argument(
+        "--top-k-nums",
+        nargs="+",
+        type=int,
+        default=DEFAULT_TOP_K_NUMS,
+        help="Top-K values for MoE LoRA operations",
+    )
+
+    p.add_argument(
+        "--num-experts",
+        nargs="+",
+        type=int,
+        default=DEFAULT_NUM_EXPERTS,
+        help="Number of experts for MoE LoRA operations",
+    )
+
     parser = FlexibleArgumentParser(
         description=f"""
 Benchmark LoRA kernels:
@ -8,10 +8,9 @@ import math
 import os
 import pickle as pkl
 import time
-from collections.abc import Iterable
+from collections.abc import Callable, Iterable
 from dataclasses import dataclass
 from itertools import product
-from typing import Callable, Optional

 import pandas as pd
 import torch
@ -34,7 +33,7 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
     quantize_weights,
 )
 from vllm.scalar_type import ScalarType, scalar_types
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser

 DEFAULT_MODELS = ["meta-llama/Llama-3-8b", "meta-llama/Llama-2-70b-hf"]
 DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512, 1024]
@ -63,23 +62,23 @@ class BenchmarkTensors:
     a: torch.Tensor

     w_q: torch.Tensor
-    group_size: Optional[int]
+    group_size: int | None
     wtype: ScalarType
     w_g_s: torch.Tensor
-    w_g_zp: Optional[torch.Tensor]
-    w_ch_s: Optional[torch.Tensor]
-    w_tok_s: Optional[torch.Tensor]
+    w_g_zp: torch.Tensor | None
+    w_ch_s: torch.Tensor | None
+    w_tok_s: torch.Tensor | None


 @dataclass
 class TypeConfig:
     act_type: torch.dtype
     weight_type: ScalarType
-    output_type: Optional[torch.dtype]
-    group_scale_type: Optional[torch.dtype]
-    group_zero_type: Optional[torch.dtype]
-    channel_scale_type: Optional[torch.dtype]
-    token_scale_type: Optional[torch.dtype]
+    output_type: torch.dtype | None
+    group_scale_type: torch.dtype | None
+    group_zero_type: torch.dtype | None
+    channel_scale_type: torch.dtype | None
+    token_scale_type: torch.dtype | None


 def rand_data(shape, dtype=torch.float16, scale=1):
@ -93,8 +92,8 @@ def quantize_and_pack(
     atype: torch.dtype,
     w: torch.Tensor,
     wtype: ScalarType,
-    stype: Optional[torch.dtype],
-    group_size: Optional[int],
+    stype: torch.dtype | None,
+    group_size: int | None,
     zero_points: bool = False,
 ):
     assert wtype.is_integer(), "TODO: support floating point weights"
@ -113,7 +112,7 @@


 def create_bench_tensors(
-    shape: tuple[int, int, int], types: TypeConfig, group_size: Optional[int]
+    shape: tuple[int, int, int], types: TypeConfig, group_size: int | None
 ) -> list[BenchmarkTensors]:
     m, n, k = shape

@ -331,8 +330,8 @@ def bench_fns(label: str, sub_label: str, description: str, fns: list[Callable])
     return res


-_SWEEP_SCHEDULES_RESULTS: Optional[pd.DataFrame] = None
-_SWEEP_SCHEDULES_RESULTS_CSV: Optional[str] = None
+_SWEEP_SCHEDULES_RESULTS: pd.DataFrame | None = None
+_SWEEP_SCHEDULES_RESULTS_CSV: str | None = None


 def bench(
@ -44,7 +44,7 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
     sort_weights,
 )
 from vllm.scalar_type import ScalarType, scalar_types
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser

 DEFAULT_MODELS = ["meta-llama/Llama-2-7b-hf/TP1"]
 DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192]
@ -22,7 +22,7 @@ from vllm.model_executor.layers.fused_moe.fused_moe import *
 from vllm.platforms import current_platform
 from vllm.transformers_utils.config import get_config
 from vllm.triton_utils import triton
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser

 FP8_DTYPE = current_platform.fp8_dtype()

@ -211,7 +211,7 @@ def get_rocm_tuning_space(use_fp16):
     num_warps_range = [1, 2, 4, 8]
     group_m_range = [1, 4, 8, 16, 32]
     num_stage_range = [2]
-    waves_per_eu_range = [0]
+    waves_per_eu_range = [0, 1, 2, 4]
     matrix_instr_nonkdim_range = [16, 32] if use_fp16 else []
     kpack_range = [1, 2] if use_fp16 else []

@@ -579,19 +579,23 @@ def main(args: argparse.Namespace):
         E = config.ffn_config.moe_num_experts
         topk = config.ffn_config.moe_top_k
         intermediate_size = config.ffn_config.ffn_hidden_size
+        hidden_size = config.hidden_size
     elif config.architectures[0] == "JambaForCausalLM":
         E = config.num_experts
         topk = config.num_experts_per_tok
         intermediate_size = config.intermediate_size
+        hidden_size = config.hidden_size
     elif config.architectures[0] in (
         "DeepseekV2ForCausalLM",
         "DeepseekV3ForCausalLM",
         "DeepseekV32ForCausalLM",
         "Glm4MoeForCausalLM",
+        "NemotronHForCausalLM",
     ):
         E = config.n_routed_experts
         topk = config.num_experts_per_tok
         intermediate_size = config.moe_intermediate_size
+        hidden_size = config.hidden_size
     elif config.architectures[0] in (
         "Qwen2MoeForCausalLM",
         "Qwen3MoeForCausalLM",
@@ -600,10 +604,23 @@ def main(args: argparse.Namespace):
         E = config.num_experts
         topk = config.num_experts_per_tok
         intermediate_size = config.moe_intermediate_size
+        hidden_size = config.hidden_size
+    elif config.architectures[0] == "Qwen3VLMoeForConditionalGeneration":
+        text_config = config.get_text_config()
+        E = text_config.num_experts
+        topk = text_config.num_experts_per_tok
+        intermediate_size = text_config.moe_intermediate_size
+        hidden_size = text_config.hidden_size
     elif config.architectures[0] in ("HunYuanMoEV1ForCausalLM"):
         E = config.num_experts
         topk = config.moe_topk[0]
         intermediate_size = config.moe_intermediate_size[0]
+        hidden_size = config.hidden_size
+    elif config.architectures[0] in ["Qwen3OmniMoeForConditionalGeneration"]:
+        E = config.thinker_config.text_config.num_experts
+        topk = config.thinker_config.text_config.num_experts_per_tok
+        intermediate_size = config.thinker_config.text_config.moe_intermediate_size
+        hidden_size = config.thinker_config.text_config.hidden_size
     else:
         # Support for llama4
         config = config.get_text_config()
@@ -611,6 +628,7 @@ def main(args: argparse.Namespace):
         E = config.num_local_experts
         topk = config.num_experts_per_tok
         intermediate_size = config.intermediate_size
+        hidden_size = config.hidden_size
     enable_ep = bool(args.enable_expert_parallel)
     if enable_ep:
         ensure_divisibility(E, args.tp_size, "Number of experts")
@@ -619,8 +637,7 @@ def main(args: argparse.Namespace):
     else:
         ensure_divisibility(intermediate_size, args.tp_size, "intermediate_size")
         shard_intermediate_size = 2 * intermediate_size // args.tp_size
-    hidden_size = config.hidden_size
-    dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
+    dtype = torch.float16 if current_platform.is_rocm() else config.dtype
     use_fp8_w8a8 = args.dtype == "fp8_w8a8"
     use_int8_w8a16 = args.dtype == "int8_w8a16"
     block_quant_shape = get_weight_block_size_safety(config)
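The hunks above move the hidden_size assignment into every architecture branch and add MoE models whose shape parameters live on nested configs (Qwen3-VL via get_text_config(), Qwen3-Omni via thinker_config.text_config). A minimal sketch of how such parameters can be read from a Hugging Face checkpoint, assuming a recent transformers release that provides PretrainedConfig.get_text_config(); the helper name is invented for illustration:

from transformers import AutoConfig


def moe_shape(model: str) -> tuple[int, int, int, int]:
    """Return (num_experts, topk, intermediate_size, hidden_size) for a MoE checkpoint."""
    config = AutoConfig.from_pretrained(model, trust_remote_code=True)
    # Many multimodal wrappers keep the MoE fields on their text sub-config.
    text = config.get_text_config()
    num_experts = getattr(text, "num_experts", None) or getattr(text, "n_routed_experts")
    topk = text.num_experts_per_tok
    return num_experts, topk, text.moe_intermediate_size, text.hidden_size
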
@@ -17,7 +17,7 @@ from vllm.model_executor.layers.fused_moe.moe_permute_unpermute import (
 )
 from vllm.model_executor.layers.fused_moe.utils import _fp8_quantize
 from vllm.platforms import current_platform
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser

 FP8_DTYPE = current_platform.fp8_dtype()

@@ -344,7 +344,7 @@ def main(args: argparse.Namespace):
     topk = config.num_experts_per_tok

     hidden_size = config.hidden_size
-    dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
+    dtype = torch.float16 if current_platform.is_rocm() else config.dtype
     use_fp8_w8a8 = args.dtype == "fp8_w8a8"
     use_int8_w8a16 = args.dtype == "int8_w8a16"
     use_customized_permute = args.use_customized_permute
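Both files above also swap config.torch_dtype for config.dtype, tracking the attribute rename on Hugging Face configs. A hedged, version-tolerant way to resolve the benchmark dtype could look like the sketch below (the helper name is invented here; it assumes older transformers releases only expose torch_dtype):

import torch
from vllm.platforms import current_platform


def resolve_benchmark_dtype(config) -> torch.dtype:
    # Prefer the new attribute, fall back to the legacy one.
    config_dtype = getattr(config, "dtype", None) or getattr(config, "torch_dtype", torch.bfloat16)
    # The scripts above force float16 on ROCm regardless of the checkpoint dtype.
    return torch.float16 if current_platform.is_rocm() else config_dtype
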
@@ -39,7 +39,7 @@ import torch
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.platforms import current_platform
 from vllm.transformers_utils.config import get_config
-from vllm.utils import FlexibleArgumentParser
+from vllm.utils.argparse_utils import FlexibleArgumentParser

 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

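The rotary-embedding benchmark picks up the same import move: FlexibleArgumentParser now lives in vllm.utils.argparse_utils rather than vllm.utils. A hedged, backwards-compatible import, assuming older vLLM releases still re-export the class from vllm.utils:

try:
    from vllm.utils.argparse_utils import FlexibleArgumentParser
except ImportError:  # older vLLM layouts
    from vllm.utils import FlexibleArgumentParser

# Illustrative usage; the flag name is an assumption, not the script's full CLI.
parser = FlexibleArgumentParser(description="Benchmark rotary embedding kernels.")
parser.add_argument("--head-size", type=int, default=128)
args = parser.parse_args()
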
@@ -3,16 +3,15 @@

 import random
 import time
-from typing import Optional

 import torch

 from vllm import _custom_ops as ops
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
-from vllm.utils import (
+from vllm.utils.argparse_utils import FlexibleArgumentParser
+from vllm.utils.torch_utils import (
     STR_DTYPE_TO_TORCH_DTYPE,
-    FlexibleArgumentParser,
     create_kv_caches_with_random,
 )

@@ -37,7 +36,7 @@ def main(
     seed: int,
     do_profile: bool,
     device: str = "cuda",
-    kv_cache_dtype: Optional[str] = None,
+    kv_cache_dtype: str | None = None,
 ) -> None:
     current_platform.seed_everything(seed)

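The final hunk replaces typing.Optional with the PEP 604 spelling str | None, which is valid in evaluated annotations only on Python 3.10+ (assumed here to be the benchmarks' floor), and lets the typing import be dropped entirely. A minimal illustration:

def normalize_kv_cache_dtype(kv_cache_dtype: str | None = None) -> str:
    # Illustrative: treat a missing value like the benchmark's "auto" setting.
    return kv_cache_dtype or "auto"
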
Some files were not shown because too many files have changed in this diff.