Mirror of https://github.com/vllm-project/vllm.git (synced 2025-11-04 17:34:34 +08:00)

Compare commits: v0.10.2rc2 ... amd_dev

1282 Commits
| SHA1 | Author | Date | |
|---|---|---|---|
| c7021f1270 | |||
| 2072fdc044 | |||
| 6eefda507a | |||
| a0003b56b0 | |||
| 5beacce2ea | |||
| 8669c69afa | |||
| 1651003c35 | |||
| 1cb8c6c5fe | |||
| e05a6754a8 | |||
| 084a9dae80 | |||
| c9461e05a4 | |||
| 4dfdb821c8 | |||
| 58fab50d82 | |||
| db6f28d898 | |||
| 14e2f1231e | |||
| 7c4767f1eb | |||
| 9771e0b432 | |||
| 980de31ca0 | |||
| 1c160841ea | |||
| 4ca13a8667 | |||
| 675aa2ec64 | |||
| 3ae082c373 | |||
| 49c00fe304 | |||
| 141d3b9fc5 | |||
| abf3db40ef | |||
| 8e4ca4d14e | |||
| 1a0f4defb7 | |||
| 843af7f7fc | |||
| 1f633b8632 | |||
| a4c29e6e82 | |||
| 8f18feb191 | |||
| ed540d6d4c | |||
| f6027b2855 | |||
| ab3e80042e | |||
| ceacedc1f9 | |||
| bfa59be8f1 | |||
| 265ecb05fb | |||
| 09a7e6f617 | |||
| 6c2eef5a5d | |||
| 19748806f0 | |||
| 4a8a567e16 | |||
| 344a0017c0 | |||
| becb7de40b | |||
| 250fb1b8ea | |||
| 647214f3d5 | |||
| ddeec11ba9 | |||
| 86ed77022d | |||
| aa1356ec53 | |||
| ecc3c0940a | |||
| ba09652de2 | |||
| bd66b8529b | |||
| 6c728f7771 | |||
| 80e9452984 | |||
| c3a2c6ac5f | |||
| 72f431e709 | |||
| be4445072c | |||
| f381cf2302 | |||
| 5ff5d94e77 | |||
| f95da13c3d | |||
| aef368aa08 | |||
| 5f6cbf60d6 | |||
| 3ada34f9cb | |||
| 0eb8f2b880 | |||
| 163965d183 | |||
| a03cf9bc70 | |||
| 352c0c8a28 | |||
| bfe0b4bd2a | |||
| 58fbbcb2f5 | |||
| 87778d5f00 | |||
| f9e7ad5400 | |||
| 4d0f266113 | |||
| e93ff6c8b9 | |||
| 1c691f4a71 | |||
| 9fce7bee74 | |||
| b63f2143f8 | |||
| f32bf7582e | |||
| 8a81d776ce | |||
| f6fdacd82c | |||
| d31f7844f8 | |||
| 7a6c8c3fa1 | |||
| 221bf72577 | |||
| b3aba04e5a | |||
| 8a297115e2 | |||
| 191eed0bb9 | |||
| fb860670da | |||
| 83e760c57d | |||
| c2bba69065 | |||
| e133d6d218 | |||
| a1946c9f61 | |||
| 9f020f4f31 | |||
| 3b45075206 | |||
| 168e578efc | |||
| 6ac5e06f7c | |||
| 5c2acb270a | |||
| b26b70bec4 | |||
| ab4be40fc5 | |||
| 245e4f2c01 | |||
| 1d165d6d85 | |||
| 83004020fd | |||
| 12e21701e7 | |||
| 30a33b92ee | |||
| 7c572544e4 | |||
| c312320764 | |||
| c981f0ea78 | |||
| 6367bde739 | |||
| f50cc221ea | |||
| acedc74b1a | |||
| d29483b58a | |||
| 950cf9e58e | |||
| 3125d79950 | |||
| e33ee23ee3 | |||
| b10c64c834 | |||
| 0925b28a8e | |||
| 99722d5f0e | |||
| 4c91a28e30 | |||
| b038d9c40c | |||
| 2ba60ec7fe | |||
| bd7157a071 | |||
| be429d0cfd | |||
| c253745eb8 | |||
| daec4d2624 | |||
| 6c9fdbf725 | |||
| 483ea64611 | |||
| e20eba753b | |||
| bbc1b29665 | |||
| acb1bfa601 | |||
| 75c7ad9918 | |||
| 5550ff9c25 | |||
| 3aeb19a39e | |||
| 8c017b3490 | |||
| 9c2c2287a0 | |||
| fec2b341ad | |||
| 87bc0c492f | |||
| fe3b9372ad | |||
| bde9e2272a | |||
| 08405609cc | |||
| ab81379ea6 | |||
| 4ffd6e8942 | |||
| 965c5f4914 | |||
| 4d055ef465 | |||
| 17c540a993 | |||
| 4d4d6bad19 | |||
| 11ae016bd7 | |||
| 41d3071918 | |||
| fb5e10d3fb | |||
| b2f78cbad4 | |||
| 23583ee28c | |||
| 01c977e96d | |||
| b3dda72c23 | |||
| fb0571b077 | |||
| 2ed8b6b3d0 | |||
| 013abde6ef | |||
| a5464dcf92 | |||
| ac3ed5a815 | |||
| e6ba2000ae | |||
| aa255ff55a | |||
| 7bb736d00e | |||
| 9f4e30904b | |||
| 5afd3276df | |||
| 43721bc67f | |||
| 02d709a6f1 | |||
| 4a510ab487 | |||
| 314fa8abbf | |||
| 334535b6fb | |||
| dcbb3f1871 | |||
| 00417f4e44 | |||
| ed344f4116 | |||
| e51928793e | |||
| d2740fafbf | |||
| 17838e50ef | |||
| 44c8555621 | |||
| f7d318de2b | |||
| 76f0d05bc6 | |||
| 7d8975de84 | |||
| 785d8b6410 | |||
| f6cdc9a02f | |||
| 509cdc0370 | |||
| 9b6504c307 | |||
| e19b16dde6 | |||
| 582f2c6be7 | |||
| f8a0acbdbe | |||
| 1317034379 | |||
| 0ecc553ee6 | |||
| f96bc3649c | |||
| 938c43ea7f | |||
| 0a9ef0cfce | |||
| e5b438a247 | |||
| 0b99f5d302 | |||
| 1f491aa0c8 | |||
| de92d916fe | |||
| a1063628a4 | |||
| d796375258 | |||
| 14f8456344 | |||
| 4794c2bd92 | |||
| d3cbaa08dc | |||
| 828523ad8e | |||
| 136a17fe6e | |||
| f57438338d | |||
| 5d598680e3 | |||
| 8f4b313c37 | |||
| f93e348010 | |||
| f54f85129e | |||
| d4d1a6024f | |||
| db1764e4e0 | |||
| 7f83b4ee8e | |||
| 5c3bae1a6a | |||
| 5210dc3940 | |||
| 650b51f9f9 | |||
| 6256697997 | |||
| 71557a5f7c | |||
| f3c378ffa7 | |||
| f5ed68ef63 | |||
| efdef57b1f | |||
| b8a4572157 | |||
| 302ef403a2 | |||
| 8865da157b | |||
| f0862eae43 | |||
| 8c851f6d04 | |||
| 7cfa420f49 | |||
| a27b288e4a | |||
| e471d7ca7e | |||
| c43ca8259e | |||
| 85a65e7f51 | |||
| a2986b3e33 | |||
| 96b9aa5aa0 | |||
| e66d787bce | |||
| bfad142e25 | |||
| 9354660036 | |||
| 07ca70af8d | |||
| 2dcd12d357 | |||
| 579d2e5458 | |||
| 0512c04aee | |||
| 7e0ef4084a | |||
| 4aed506b65 | |||
| a86b4c58e8 | |||
| ff4810ba73 | |||
| 9d6964926e | |||
| 0e65818910 | |||
| 380f17527c | |||
| b92ab3deda | |||
| acaa2c0a4a | |||
| 82af928c41 | |||
| 87efc681db | |||
| c3a722fcb2 | |||
| aba48f7db1 | |||
| 04b5f9802d | |||
| efc8f7d814 | |||
| 6d87a2838c | |||
| e6cdbd6792 | |||
| df850c4912 | |||
| 720394de43 | |||
| 88a49745af | |||
| ca683a2a72 | |||
| e9f1b8c9e9 | |||
| ea97940d6c | |||
| fdd32750f0 | |||
| c715ba3735 | |||
| 9c4cb68339 | |||
| 780eb03d9b | |||
| ef9676a1f1 | |||
| 70b1b330e1 | |||
| d1d063a588 | |||
| 7e6edb1469 | |||
| 74704d4553 | |||
| d2f816d6ff | |||
| 577d498212 | |||
| fd85c9f426 | |||
| d32c611f45 | |||
| 01ad27faff | |||
| 481545b397 | |||
| d3cc8427c0 | |||
| 4821ac1b4d | |||
| 4497c8f821 | |||
| 2e36cdbe2b | |||
| fe3edb4cf0 | |||
| 29350922c6 | |||
| 8ae169286f | |||
| 8a0af6a561 | |||
| cfded80793 | |||
| b59dd19b55 | |||
| 3e051bda82 | |||
| 8317f72354 | |||
| d8bebb008a | |||
| 35bc22f23c | |||
| fa96fb9c70 | |||
| e3fdb627d9 | |||
| 7200a21cd1 | |||
| 577c72a227 | |||
| 314285d4f2 | |||
| d2a7938582 | |||
| 89342ce4c0 | |||
| f89f599395 | |||
| e251e457c5 | |||
| afc47e4de7 | |||
| e3b90c1ba2 | |||
| 134f70b3ed | |||
| a1b2d658ee | |||
| 5c7fe25491 | |||
| 53c9a7cee2 | |||
| 0d21b9b51e | |||
| 10214b6935 | |||
| 4a61950f4d | |||
| 3263799056 | |||
| 8e67b2557a | |||
| 4073c82c4e | |||
| 767c3ab869 | |||
| 4f207c7174 | |||
| 782505ed8e | |||
| 98f30b8cba | |||
| 3cd36660f7 | |||
| 46ad73955a | |||
| 41f3884438 | |||
| 60e419c1ee | |||
| 7ef6052804 | |||
| 4fca1a1bd2 | |||
| a6049be73c | |||
| 18ed7746ea | |||
| 8fcaaf6a16 | |||
| 9bb38130cb | |||
| b91d8db873 | |||
| 045b396d09 | |||
| 76852017ea | |||
| 82e64c7a20 | |||
| 4ca204055e | |||
| c5c8f5ea59 | |||
| 01653a917b | |||
| 0cd103e7cb | |||
| 5be7ca1b99 | |||
| f0a30a067b | |||
| 9d6cff3ede | |||
| a25f2adee9 | |||
| d0bed837ac | |||
| f7ee69868a | |||
| d2a71530c1 | |||
| 086609de64 | |||
| 727144bed1 | |||
| 55392bc879 | |||
| ddaff2938e | |||
| 27ed39a347 | |||
| 8f8474fbe3 | |||
| be067861c6 | |||
| 5bc26c438d | |||
| eef921f45e | |||
| e317414ce1 | |||
| 949cb0170d | |||
| e94cfd51da | |||
| 7c12763b24 | |||
| 3b780a4bbb | |||
| 30f78af147 | |||
| 19a9b169bf | |||
| 96ad65b7fe | |||
| 8d2b8c0ff2 | |||
| b2155ed317 | |||
| 910abdbd08 | |||
| cddce79fda | |||
| e519281920 | |||
| 7b03584de8 | |||
| ae9d0e7da5 | |||
| 0e67102d93 | |||
| f4ba2061cf | |||
| 1e6848a65d | |||
| 67661375fa | |||
| 213b64452a | |||
| 784c231151 | |||
| 606b00e80f | |||
| 720d3cd0f0 | |||
| ab196edefb | |||
| 3ee202ea1e | |||
| ad430a67ca | |||
| 6f0f570c43 | |||
| b545a0b207 | |||
| 29255cfc3b | |||
| da4455609d | |||
| aafb99a4d4 | |||
| 757fa4a4da | |||
| c6187f55f7 | |||
| 8983e0216f | |||
| 1ee35382cb | |||
| 6e783bc54b | |||
| c9d33c60dc | |||
| 2e54db4d2b | |||
| 44f633dba1 | |||
| a462331e36 | |||
| 4069db3f2e | |||
| 0d37450eb7 | |||
| 47e66c24e2 | |||
| 3b736e1c38 | |||
| 2c1c7dfb35 | |||
| e246ad6f0c | |||
| 5728da11ea | |||
| 92be3f3517 | |||
| d1ddf340c8 | |||
| ec10fd0abc | |||
| 0426e3c5e1 | |||
| 4bdf7ac593 | |||
| dc7976dd9f | |||
| e4791438ed | |||
| e6e898f95d | |||
| ddcbc2f334 | |||
| a83ff278d6 | |||
| cf4cd6c24f | |||
| b960441812 | |||
| 1317028aa8 | |||
| 5e49c3e777 | |||
| 0d7c3cb51d | |||
| 1b2c440cd6 | |||
| 0f29dca988 | |||
| d24cf322e1 | |||
| d17f0fbf30 | |||
| 43ab8cfaa5 | |||
| de253d63b7 | |||
| 8bd696fa53 | |||
| bb6d8c21f9 | |||
| ebf6ef1a9b | |||
| 0c52d6ef81 | |||
| 467a4f98f1 | |||
| e614ab7806 | |||
| 2a03f93de9 | |||
| da364615fc | |||
| f08919b7d1 | |||
| 93f2c0aa08 | |||
| 4ebc9108a7 | |||
| e1ba235668 | |||
| b82f4307c9 | |||
| 76879cc160 | |||
| b25d7b5657 | |||
| e09d1753ec | |||
| 4ba8875749 | |||
| 6273fe8d3d | |||
| 9fb3ae4e6f | |||
| 76afe4edf8 | |||
| c1b06fc182 | |||
| 241b4cfe66 | |||
| 9fc983c707 | |||
| 2f99f2f506 | |||
| 338b1bf04f | |||
| e39dc46f8f | |||
| 10c75b5439 | |||
| f9582fd8f4 | |||
| f377333bd7 | |||
| f8607863d8 | |||
| 335b28f7d1 | |||
| 5e65d6b2ad | |||
| 0d4f48fa10 | |||
| 127c8b782a | |||
| cd9890544b | |||
| 067da2d1df | |||
| 046118b938 | |||
| b32260ab85 | |||
| f80e7866c0 | |||
| 31a4b3e6c4 | |||
| caf8b1c084 | |||
| 1b86bd8e18 | |||
| 59012df99b | |||
| 3d1f67616d | |||
| 6ebaf43ee4 | |||
| 0c824fc46f | |||
| eb577e4655 | |||
| 8f36850f73 | |||
| 29fd2662ba | |||
| 30a3e5af69 | |||
| a38c1bfe09 | |||
| 320feae6f5 | |||
| 1e4ecca1d0 | |||
| c0a7b89d8e | |||
| 6f59beaf0b | |||
| 41f1cf38f2 | |||
| 08d26a1b7e | |||
| 63773a6200 | |||
| 883b42896a | |||
| e1098ced95 | |||
| d100d78eb3 | |||
| 7e4cd070b0 | |||
| 46b0779996 | |||
| de342585ff | |||
| 185d8ed44f | |||
| d9836d4517 | |||
| 5f7e8a916a | |||
| 4dbdf4a294 | |||
| c6873c4e6d | |||
| 2111b4643c | |||
| c50901f3b9 | |||
| 8229280a9c | |||
| f77df94647 | |||
| f231e5bc21 | |||
| 2161efe978 | |||
| f23b4c04fd | |||
| 93540958b8 | |||
| 44b9af5bb2 | |||
| 7cd95dc8a3 | |||
| c02058c222 | |||
| b2ea5ba677 | |||
| 824a3f403f | |||
| 05f6846ede | |||
| 20db99cc69 | |||
| 6431be808f | |||
| 4727a8afa7 | |||
| b8f603cebe | |||
| fc679696f8 | |||
| ab5e7d93f4 | |||
| 0340f45553 | |||
| 19a00eb210 | |||
| 391612e78b | |||
| 77c95f72f7 | |||
| 59f30d0448 | |||
| 43c146ca42 | |||
| 7c2ec0fe87 | |||
| 039b6bade3 | |||
| 6c04638214 | |||
| 91ac7f764d | |||
| 4be7d7c1c9 | |||
| 59b477645c | |||
| 778f554157 | |||
| d3c84297c3 | |||
| f509a20846 | |||
| 60bc25e74c | |||
| b893d661b1 | |||
| 6b6e98775f | |||
| 9c3c21c519 | |||
| 512b8affa4 | |||
| 1c0c68202c | |||
| 5f317530ec | |||
| 557b2e961d | |||
| 4e256cadc2 | |||
| d6953beb91 | |||
| 17edd8a807 | |||
| 3303cfb4ac | |||
| b7e8e4e6be | |||
| 432e1cbc23 | |||
| 201c971e96 | |||
| e0986ea07b | |||
| a964e5e6c3 | |||
| 78c1d5bfd2 | |||
| 59a85c366e | |||
| 119f00630b | |||
| a42d2df75f | |||
| 5c057e068f | |||
| ed3aeb25a4 | |||
| 86ee949128 | |||
| 4570535ec4 | |||
| 2a6dc67eb5 | |||
| f05fea1f5e | |||
| d0df145c2a | |||
| 1838cd4860 | |||
| 7d6b03381e | |||
| 7c2e91c4e0 | |||
| 736fbf4c89 | |||
| 44ea85137a | |||
| d3d649efec | |||
| ea507c3a93 | |||
| 9705fba7b7 | |||
| 2f7dbc9b42 | |||
| ea25a76c05 | |||
| 67bc0c003e | |||
| 5a05f26603 | |||
| 7ef40bb983 | |||
| 767cbb011d | |||
| 7cfa4b24bf | |||
| b71fcd4905 | |||
| 75003f34e8 | |||
| 78b8015a4d | |||
| 831b124151 | |||
| c1ffcb55da | |||
| 0879736aab | |||
| a26917332f | |||
| cd9e5b8340 | |||
| 300a59c4c3 | |||
| d76541a6c5 | |||
| dd96465fd7 | |||
| 4f8f47e87e | |||
| d78fda7cda | |||
| 73a99cc2a5 | |||
| adae0c1f43 | |||
| cbf9221992 | |||
| 5f42fc53b6 | |||
| 8ee846c27c | |||
| 812b7f54a8 | |||
| 5f2cacdb1e | |||
| aa5053e3fe | |||
| 79aa244678 | |||
| 2ed3f20dba | |||
| 48f309029a | |||
| 0e93ac0b3a | |||
| 5446ad1d24 | |||
| f9a8084e48 | |||
| 3e70e3d4d5 | |||
| eb0fa43868 | |||
| 0ad9951c41 | |||
| 8c9117181d | |||
| c4b48d3c0f | |||
| 10d765482d | |||
| 39b643dc1a | |||
| 711f485643 | |||
| 9c5ee91b2a | |||
| 27edd2aeb4 | |||
| e5017cd6d6 | |||
| 6a7796e871 | |||
| 47b9339546 | |||
| 5d5146eee3 | |||
| 2aaa423842 | |||
| ad2d788016 | |||
| 36ce76c632 | |||
| f1fc2107a3 | |||
| 13cdc02173 | |||
| 502640c3f9 | |||
| 3d5f1c8640 | |||
| 1cab2f9cad | |||
| 1e50f1be70 | |||
| ad87ba927a | |||
| decf7f794b | |||
| d00d652998 | |||
| 3b279a84be | |||
| 5e4a8223c6 | |||
| e51de388a2 | |||
| cc253b73d3 | |||
| 7d6fb905d9 | |||
| 418d111f8c | |||
| be8921fbba | |||
| d4e7a1152d | |||
| be22bb6f3d | |||
| 169313b9f8 | |||
| 0b018d8baf | |||
| c31246800c | |||
| 4134312b35 | |||
| da554f932e | |||
| aac622e0cd | |||
| 1726e93ef1 | |||
| ee04c0cd04 | |||
| c36f0aa300 | |||
| 5234dc7451 | |||
| 3b7c20a6b5 | |||
| f9e714813a | |||
| 2518230d3e | |||
| a332b84578 | |||
| 1405f0c7ba | |||
| 84d57342b6 | |||
| 57b46d769e | |||
| f48b6a03ba | |||
| 2a69ab4899 | |||
| 8d7da92fd7 | |||
| e952eee698 | |||
| 66bca9b8bd | |||
| 99028fda44 | |||
| 1244948885 | |||
| a73f6491c8 | |||
| 001e50c92c | |||
| 96ebcaa3ad | |||
| 5db1870bb9 | |||
| 2ce26b9b5d | |||
| a388252ac4 | |||
| 9a9f48dff7 | |||
| 67f3fb0844 | |||
| 43b752c325 | |||
| cfd302db9b | |||
| fb610ae684 | |||
| 2f652e6cdf | |||
| e6a226efba | |||
| a2e6fa7e03 | |||
| 9f1c4ecaf2 | |||
| ef283548f7 | |||
| f4db5e6de1 | |||
| 099aaee536 | |||
| 35fe398c7c | |||
| bb6d43047e | |||
| bc546f76a1 | |||
| 80608ba5af | |||
| e184c9c510 | |||
| d7e34b4210 | |||
| ef6e0e7132 | |||
| 1ad3aca682 | |||
| 8d0afa9b42 | |||
| fa7e254a7f | |||
| e23cacda35 | |||
| 2e1b8bc2b6 | |||
| e47433b3c1 | |||
| 23194d83e8 | |||
| 61aedb5ffe | |||
| d3bd171123 | |||
| 89e4050af4 | |||
| 78a47f87ce | |||
| 6a113d9aed | |||
| 2e4fe48c37 | |||
| 8eb0a1d906 | |||
| fea3e476aa | |||
| 61a3431613 | |||
| 9bedac9623 | |||
| c42ff4f4fd | |||
| d5ab28511c | |||
| e61eb5e09d | |||
| 0899ba5b42 | |||
| 145ac73317 | |||
| d0d138bc55 | |||
| 43227236ec | |||
| 8616300ae2 | |||
| edbaadd91f | |||
| 9360d34fa1 | |||
| 1b67b04656 | |||
| bd51f78e39 | |||
| 65ecb4f134 | |||
| 143844fa43 | |||
| 219cfbe7f6 | |||
| 9b44a7d926 | |||
| a3ae45a38c | |||
| 0307428d65 | |||
| 471997adf6 | |||
| b1ded114b9 | |||
| f4e4088c99 | |||
| 0efd540dbc | |||
| 6144754014 | |||
| 69311446ba | |||
| da63274d9f | |||
| c216119d64 | |||
| 5546acb463 | |||
| c0ec81836f | |||
| b65e56babe | |||
| 49996cd597 | |||
| ecb37e276a | |||
| a5354b3ed2 | |||
| f9df8b4ad7 | |||
| ec152c8748 | |||
| 7977e5027c | |||
| 3f5d902d2a | |||
| 27d7638b94 | |||
| 176173989a | |||
| 23b8ee672d | |||
| 3939152069 | |||
| cd87bfbf37 | |||
| b3613e3ace | |||
| d346ec695e | |||
| c242c98031 | |||
| f1d53d150c | |||
| 92da847cf5 | |||
| 3958b96bf5 | |||
| 8bf8f45822 | |||
| 6f5c0931c1 | |||
| 4e33a7ea85 | |||
| dc48ba0c75 | |||
| 4778b42660 | |||
| c70ac4b8ff | |||
| cf89202855 | |||
| f075693da7 | |||
| f708bd4904 | |||
| 0002b7f0d1 | |||
| 11aafd9886 | |||
| b761df963c | |||
| 33f6aaf972 | |||
| 56aafa8c0b | |||
| 8d52f2b3a7 | |||
| 984d18498a | |||
| d4d9899860 | |||
| db1e42f627 | |||
| bc9d7b5595 | |||
| fe6b19c314 | |||
| 2827b3f4a3 | |||
| 2b6b1d7809 | |||
| 633f943e30 | |||
| b03b1b97f6 | |||
| dfb9af2014 | |||
| 19f76ee68e | |||
| dd70437a4f | |||
| 99b3a504c5 | |||
| 6e30010d2f | |||
| 52621c8f5c | |||
| d48f4d6daf | |||
| e84e0735c7 | |||
| 3edf87d25f | |||
| 392edee34a | |||
| 983056e456 | |||
| 13dd93c667 | |||
| 53a30845be | |||
| 8b77328ffe | |||
| 9fe4c2bdb9 | |||
| 081b5594a2 | |||
| 57329a8c01 | |||
| 8c435c9bce | |||
| e71b8e210d | |||
| 89fa54e6f7 | |||
| 3d54bdcb73 | |||
| 6b0fcbbf43 | |||
| 0fa673af4c | |||
| 3468f17ebe | |||
| 71b25b0d48 | |||
| 0ea80c87d9 | |||
| b8d9e4a326 | |||
| 13cc7f5370 | |||
| 916bd9204d | |||
| e04a1b6b21 | |||
| 2e5df88c92 | |||
| 0754ac4c49 | |||
| 03858e6d1c | |||
| 532a6cfccb | |||
| eb32335e35 | |||
| 69a8c8e99a | |||
| 6c340da4df | |||
| 2f17117606 | |||
| 1e9a77e037 | |||
| d2af67441d | |||
| 0bcc3a160d | |||
| 70fbdb26e9 | |||
| 7f570f1caa | |||
| eaeca3cd7f | |||
| 12c1287d64 | |||
| 17b4c6685c | |||
| 3c2b2ccece | |||
| 7be9ffcd9f | |||
| 393de22d2e | |||
| 1260180c67 | |||
| af4ee63e0e | |||
| bc092ea873 | |||
| 755ed7b05b | |||
| a676e668ee | |||
| c85be1f6dd | |||
| 845adb3ec6 | |||
| 90b139cfff | |||
| 4492e3a554 | |||
| 05c19485a5 | |||
| 52d0cb8458 | |||
| 5c1e496a75 | |||
| e7f27ea648 | |||
| 1f29141258 | |||
| 6160ba4151 | |||
| fea8006062 | |||
| e6750d0b18 | |||
| 8c853050e7 | |||
| f84a472a03 | |||
| 54e42b72db | |||
| 2dda3e35d0 | |||
| d83f3f7cb3 | |||
| 302eb941f3 | |||
| 487745ff49 | |||
| 9313be5017 | |||
| 8938774c79 | |||
| e18b714b2e | |||
| b1068903fd | |||
| 164299500b | |||
| 58c360d9be | |||
| 42488dae69 | |||
| b67dece2d8 | |||
| 2338daffd3 | |||
| 2e19a848d4 | |||
| 77a7fce1bb | |||
| 6488f3481b | |||
| 27ec3c78f3 | |||
| 1cbcfb94de | |||
| fed8a9b107 | |||
| 190c45a6af | |||
| 5caaeb714c | |||
| d747c2ef18 | |||
| c30b405b8f | |||
| 77d906995c | |||
| 359d293006 | |||
| 9df8da548e | |||
| bf68fd76a9 | |||
| de94289a98 | |||
| 1983609239 | |||
| d06b5a95cb | |||
| be0bb568c9 | |||
| c8bde93367 | |||
| 88d7bdbd23 | |||
| 0d235b874a | |||
| 7ad5e50adf | |||
| dc464a3d39 | |||
| 1210e4d95b | |||
| e0b24ea030 | |||
| bde2a1a8a4 | |||
| 5e25b12236 | |||
| c85d75cf08 | |||
| abad204be6 | |||
| 7361ab379f | |||
| 95bc60e4cb | |||
| 4f2954f724 | |||
| eca7be9077 | |||
| 969b4da3a6 | |||
| 4f8c4b890a | |||
| ae002924e9 | |||
| 690f948e4a | |||
| 08275ec0a2 | |||
| c828d1bf98 | |||
| 8b8a8afc89 | |||
| 8bdd8b5c51 | |||
| a8ffc4f0f2 | |||
| d5944d5146 | |||
| 24fab45d96 | |||
| 63400259d0 | |||
| 8c1c81a3de | |||
| a3a7828010 | |||
| 5abb117901 | |||
| 867ecdd1c8 | |||
| 24e8222745 | |||
| 100b630a60 | |||
| 527821d191 | |||
| 846197f505 | |||
| 2357480b1a | |||
| f11e3c516b | |||
| 875d6def90 | |||
| cc1dc7ed6d | |||
| a903669e10 | |||
| 2c58742dff | |||
| 4c966e440e | |||
| da5e7e4329 | |||
| f05a4f0e34 | |||
| 61d1b35561 | |||
| b6a136b58c | |||
| 0d9fe260dd | |||
| 273690a50a | |||
| 231c2c63e4 | |||
| 4322c553a6 | |||
| babad6e5dd | |||
| 9383cd6f10 | |||
| ba8d2165b6 | |||
| c98be0a232 | |||
| 5774b0a1da | |||
| e8db44f883 | |||
| fafbe11af4 | |||
| 78237e43bf | |||
| eea1783989 | |||
| f225ea7dd9 | |||
| fc97733da8 | |||
| 4741239db7 | |||
| c625f9043c | |||
| 6fa78d8f23 | |||
| 9949aa2ef1 | |||
| 0b7bed9c38 | |||
| ac0048c0ae | |||
| 090197034f | |||
| f31ff87460 | |||
| d588cd2406 | |||
| 45d7d852d3 | |||
| 8bed179109 | |||
| f552d5e578 | |||
| 8db2939289 | |||
| d5e0fca264 | |||
| 8d0ee5a564 | |||
| 922979bfcc | |||
| 239ef0c1ac | |||
| 1d7f95b85c | |||
| cfbee3d0e7 | |||
| 06a41334c7 | |||
| 175811e3b5 | |||
| c10101a3eb | |||
| ac243886b0 | |||
| 3d2c56b7a9 | |||
| 64c824cd78 | |||
| 417a164af6 | |||
| b6f01bd9a7 | |||
| 4cf71cc88a | |||
| a66d131381 | |||
| 21467f9a1c | |||
| f92d952632 | |||
| 6d0b827cbd | |||
| 0eecb31663 | |||
| 793be8d057 | |||
| 7b57a433da | |||
| 5aeb925452 | |||
| 04d3752329 | |||
| bc6e542d9f | |||
| af7dfb0d1a | |||
| 1c3ffdbecc | |||
| c438b2951c | |||
| 0ff8ebb2d7 | |||
| 26e673fe93 | |||
| 65a5910ce3 | |||
| 9aea7373ff | |||
| 30d08911f7 | |||
| cf56cf78b4 | |||
| 7ed82d1974 | |||
| 12dbd834cf | |||
| 035fd2bd2c | |||
| 1cd885bd54 | |||
| 62b38dc832 | |||
| c99db8c8dd | |||
| 72dd1595b4 | |||
| 572ddf83ce | |||
| 86647d1cd0 | |||
| 52c2a8d4ad | |||
| 367a480bd3 | |||
| bef180f009 | |||
| d88918e4c2 | |||
| 3c713a9711 | |||
| bf8b26cad1 | |||
| 032d661d27 | |||
| e08a3a3fdb | |||
| 3d9a1d2de5 | |||
| be874c0201 | |||
| 9607d5eb44 | |||
| c60e6137f0 | |||
| f91480b2d4 | |||
| 6c5f82e5aa | |||
| b7f186bbb3 | |||
| 3642909617 | |||
| c308501cb6 | |||
| 535d80056b | |||
| a25ade5d47 | |||
| 8945b001db | |||
| b8a287a0a8 | |||
| c7e713616a | |||
| a36c675817 | |||
| 3da17c2cc2 | |||
| 14c1432789 | |||
| ee7a66dd9a | |||
| 431535b522 | |||
| 711e912946 | |||
| e69e0b8b5f | |||
| ddc9048394 | |||
| b1a63d1b3b | |||
| 48ecb4438b | |||
| e57fc15971 | |||
| 4bdf400218 | |||
| 7852b82b93 | |||
| a2a5f79e09 | |||
| c59a0eca42 | |||
| b716ab93a7 | |||
| 138f0d1e75 | |||
| 2506ce5189 | |||
| 47fd08aaf9 | |||
| 12aed7e453 | |||
| d90e212a3a | |||
| 2821986450 | |||
| 6c117cff7d | |||
| 7ac67ea525 | |||
| ce75e15373 | |||
| aed16879a9 | |||
| cf278ff3b2 | |||
| 838d7116ba | |||
| 5089fd749c | |||
| a3d087adec | |||
| 058525b997 | |||
| 1dfea5f4a9 | |||
| cea91a32f2 | |||
| a684c0124c | |||
| f2718d2948 | |||
| 825fdb11ad | |||
| 8c1d4acbfe | |||
| 486c5599e3 | |||
| a6149aa587 | |||
| 6c8a3c099b | |||
| 31a8a2a7bc | |||
| 1a0a04dae9 | |||
| 6d8246aaff | |||
| 9d1c50a5ac | |||
| 9a4600e4dc | |||
| 9fac6aa30b | |||
| a53ad626d6 | |||
| 1c3dad22ff | |||
| d2a30a2d93 | |||
| 75fb112d80 | |||
| 38db529f66 | |||
| 064cac7bb7 | |||
| e19bce40a1 | |||
| 505805b645 | |||
| bbdc0f2366 | |||
| dc34059360 | |||
| c4cb0af98a | |||
| 1c3b1634aa | |||
| 2ea50e977a | |||
| b419937c78 | |||
| 5f696c33b1 | |||
| 67244c86f0 | |||
| 072d7e53e5 | |||
| 01a583fea4 | |||
| bc19d75985 | |||
| fbd6523ac0 | |||
| 470484a4f5 | |||
| 21da73343a | |||
| 66072b36db | |||
| 3ed1ec4af2 | |||
| 5a33ae9a3f | |||
| c9ff9e6f0c | |||
| eaffe4486c | |||
| 8ed039d527 | |||
| 37970105fe | |||
| cc935fdd7e | |||
| abdfcd4f3d | |||
| 4f02b77de4 | |||
| 29283e8976 | |||
| 05b044e698 | |||
| aa3f105c59 | |||
| ef7eefe17a | |||
| 350c94deb3 | |||
| f4cd80f944 | |||
| 349e0e3462 | |||
| 81b16a2bc9 | |||
| e111d5b0ae | |||
| a904ea78ea | |||
| b7433ca1a4 | |||
| 5c65a72bb1 | |||
| 9d8a2d86d2 | |||
| 3bc18127ff | |||
| bec060fd99 | |||
| 52bc9d5b3e | |||
| dc2979c585 | |||
| 027d37df38 | |||
| b98219670f | |||
| 32baf1d036 | |||
| 3127274d02 | |||
| 4ac510f484 | |||
| 7fb2a5be28 | |||
| 6c036615dc | |||
| 2fc24e94f9 | |||
| 2c3c1bd07a | |||
| 5963b98b46 | |||
| e6585ddb45 | |||
| 2a4d6412e6 | |||
| e67a79db03 | |||
| 9f882d8791 | |||
| 1a456c7c90 | |||
| fedb75fa27 | |||
| bff2e5f1d6 | |||
| 3c068c637b | |||
| f20c3b0951 | |||
| 883131544f | |||
| ee5fd49150 | |||
| 7ae9887542 | |||
| e3db5ebb66 | |||
| 9d442b7c48 | |||
| eb68c2dcd9 | |||
| 8b32464ac1 | |||
| 99cc41ad50 | |||
| d6a518fdde | |||
| 4aa8c7b047 | |||
| 4b946d693e | |||
| 087c6ffc92 | |||
| 4a2d33e371 | |||
| 8f3616f422 | |||
| 47f670b03b | |||
| dd6a910aac | |||
| 1b962e2457 | |||
| bfe9380161 | |||
| 9fccd04e30 | |||
| 252ada5559 | |||
| e120533d7a | |||
| 2b85697031 | |||
| 544fe76b95 | |||
| bb58dc8c20 | |||
| 0fb2551c23 | |||
| 6c47f6bfa4 | |||
| c15309a730 | |||
| 4a9375fe9d | |||
| 03191cd8f0 | |||
| b77bf34e53 | |||
| dd39baf717 | |||
| 43a62c51be | |||
| ca2d1925ef | |||
| 0f7acdd73c | |||
| 5801e49776 | |||
| 58d4c705a8 | |||
| ea3de5ef0d | |||
| 67532a1a68 | |||
| 5672ba90bd | |||
| dd83a157f1 | |||
| 5a411ef6c4 | |||
| eeb135eb87 | |||
| 3059b9cc6b | |||
| 64ad551878 | |||
| cef32104b4 | |||
| 493b10f8bf | |||
| d119fc8614 | |||
| dbebb7f812 | |||
| 3053a22b33 | |||
| 02d4b85454 | |||
| 86daa875fe | |||
| dcf2f3ec06 | |||
| 218454b9b2 | |||
| f4d6eb95cf | |||
| cd1f885bcf | |||
| d593cf28fa | |||
| faa7a5daac | |||
| 567939953b | |||
| 08369289af | |||
| 73cfb3c5ee | |||
| 4e5affeaa1 | |||
| e4f0b4cd96 | |||
| de3e53a75b | |||
| 85e0df1392 | |||
| 0faf3cc3e8 | |||
| 7ea5c73ad7 | |||
| 27fcfe7bcf | |||
| 68dbde5dbb | |||
| 04ad0dc275 | |||
| 238c4c1705 | |||
| 8c54610265 | |||
| 17871983a2 | |||
| 759ef49b15 | |||
| 5206ab20ba | |||
| 0af3ce1355 | |||
| e1279ef00f | |||
| 2942970d44 | |||
| 3c96e7b8a1 | |||
| b42566f440 | |||
| d96e11167d | |||
| 2891603efd | |||
| de2cc3d867 | |||
| e95084308b | |||
| 7f6f2c1182 | |||
| 5bcc153d7b | |||
| 45bfa49cb8 | |||
| fd2f10546c | |||
| e757a629e7 | |||
| aae725af7c | |||
| 73df49ef3a | |||
| 25aba2b6a3 | |||
| 94b03f88dd | |||
| 49bfc538e4 | |||
| a0b26701c9 | |||
| c4afdb69cc | |||
| b834b4cbf1 | |||
| 740f0647b1 | |||
| 01413e0cf5 | |||
| 0e219cd50b | |||
| 72c99f2a75 | |||
| bf214ca226 | |||
| 2e41f5abca | |||
| bc0f6059a2 | |||
| 8de261b04a | |||
| a0d8b9738d | |||
| 59e17dd4a0 | |||
| 4979eb79da | |||
| a8c0f59973 | |||
| f4a948f33f | |||
| 3f3313981c | |||
| 78818dd1b0 | |||
| 8e5cdcda4e | |||
| 90f3f7d73e | |||
| 6dc8da5dc1 | |||
| 79cbcab871 | |||
| ff68035932 | |||
| 1177dd53e9 | |||
| fc2dbcda8b | |||
| fec347dee1 | |||
| cc3173ae98 | |||
| 3e903b6cb4 | |||
| 973c9d01da | |||
| 15b8fef453 | |||
| cfa3234a5b | |||
| 41ae4a1eab | |||
| 4dad72f0d9 | |||
| 59d7ffc17f | |||
| 1da0f1441d | |||
| 98229db244 | |||
| dbeee3844c | |||
| 30498f2a65 | |||
| abc7989adc | |||
| 9a8966bcc2 | |||
| 5febdc8750 | |||
| 99bfef841f | |||
| 89e08d6d18 | |||
| 7f2ea7074e | |||
| 4fdd6f5cbf | |||
| 8226dd56bf | |||
| 5fe643fc26 | |||
| 7ba32aa60b | |||
| c89ed8de43 | |||
| 3beadc2f25 | |||
| bc636f21a6 | |||
| 017354c0ef | |||
| 010acc6e1e | |||
| c8c42597ab | |||
| 9d2a44606d | |||
| f17c075884 | |||
| b0d1213ac3 | |||
| 57f94e88ea | |||
| 684b6870e1 | |||
| a5b84f1cbf | |||
| 9f04d9d55f | |||
| 4d7c1d531b | |||
| 41f17bf290 | |||
| bcb06d7baf | |||
| 0377802c20 | |||
| 72fc8aa412 | |||
| fdb09c77d6 | |||
| 7a1c4025f1 | |||
| 60a0951924 | |||
| 64d90c3e4f | |||
| 59d5d2c736 | |||
| d21a36f5f9 | |||
| 561a0baee0 | |||
| f592b3174b | |||
| 7920de0a2a | |||
| ddcec289c7 | |||
| e090b7b45b | |||
| 6a50eaa0d3 | |||
| 12a8414d81 | 
@@ -5,11 +5,11 @@ import os
 import sys
 import zipfile
 
-# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 450 MiB
+# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 500 MiB
 # Note that we have 800 MiB quota, please use it wisely.
 # See https://github.com/pypi/support/issues/6326 .
 # Please also sync the value with the one in Dockerfile.
-VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 450))
+VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 500))
 
 
 def print_top_10_largest_files(zip_file):
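The wheel-size script is only partially visible in this excerpt. As a rough sketch of the pattern the hunk touches (an environment override with a default, plus a helper like the `print_top_10_largest_files` named above), assuming nothing beyond what the hunk shows; this is illustrative, not the repository's exact code:

```python
import os
import zipfile

# Same default as the new value in the hunk above; overridable via the environment.
VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 500))


def wheel_size_ok(wheel_path: str) -> bool:
    # Compare the wheel's on-disk size (in MiB) against the cap.
    return os.path.getsize(wheel_path) / (1 << 20) <= VLLM_MAX_SIZE_MB


def print_top_10_largest_files(wheel_path: str) -> None:
    # Illustrative stand-in for the helper named in the hunk: list the ten
    # largest members of the wheel (a wheel is a zip archive).
    with zipfile.ZipFile(wheel_path) as zf:
        largest = sorted(zf.infolist(), key=lambda i: i.file_size, reverse=True)[:10]
        for info in largest:
            print(f"{info.file_size / (1 << 20):8.2f} MiB  {info.filename}")
```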
				
.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml (new file, 12 lines)

@@ -0,0 +1,12 @@
+# For vllm script, with -t option (tensor parallel size).
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m HandH1998/QQQ-Llama-3-8b-g128 -b 32 -l 1000 -f 5 -t 1
+model_name: "HandH1998/QQQ-Llama-3-8b-g128"
+tasks:
+- name: "gsm8k"
+  metrics:
+  - name: "exact_match,strict-match"
+    value: 0.419
+  - name: "exact_match,flexible-extract"
+    value: 0.416
+limit: 1000
+num_fewshot: 5
@@ -0,0 +1,12 @@
+# For hf script, without -t option (tensor parallel size).
+# bash .buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -l 100 -t 8
+model_name: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
+backend: "vllm-vlm"
+tasks:
+- name: "chartqa"
+  metrics:
+  - name: "relaxed_accuracy,none"
+    # TODO(zhewenl): model card is 0.90, but the actual score is 0.80.
+    value: 0.80
+limit: 100
+num_fewshot: 0
@@ -0,0 +1,10 @@
+# For hf script, without -t option (tensor parallel size).
+# bash .buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -l 250 -t 8 -f 5
+model_name: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
+tasks:
+- name: "mmlu_pro"
+  metrics:
+  - name: "exact_match,custom-extract"
+    value: 0.80
+limit: 250 # will run on 250 * 14 subjects = 3500 samples
+num_fewshot: 5
@@ -1,4 +1,5 @@
-# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -b auto -l 1319 -f 5 -t 1
+# For vllm script, with -t option (tensor parallel size)
+# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -l 1319 -t 1
 model_name: "RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic"
 tasks:
 - name: "gsm8k"
@@ -0,0 +1,12 @@
+# For vllm script, with -t option (tensor parallel size).
+# bash .buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh -m Qwen/Qwen2.5-VL-7B-Instruct -l 2500 -t 1
+
+model_name: "Qwen/Qwen2.5-VL-7B-Instruct"
+backend: "vllm-vlm"
+tasks:
+- name: "chartqa"
+  metrics:
+  - name: "relaxed_accuracy,none"
+    value: 0.855
+limit: 2500
+num_fewshot: 0

.buildkite/lm-eval-harness/configs/models-large-h100.txt (new file, 1 line)

@@ -0,0 +1 @@
+Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
@@ -0,0 +1 @@
+Meta-Llama-4-Maverick-17B-128E-Instruct-FP8-MM.yaml

.buildkite/lm-eval-harness/configs/models-mm-small.txt (new file, 1 line)

@@ -0,0 +1 @@
+Qwen2.5-VL-7B-Instruct.yaml

.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh (new executable file, 44 lines)

@@ -0,0 +1,44 @@
+#!/bin/bash
+# We can use this script to compute baseline accuracy on chartqa for vllm.
+#
+# Make sure you have lm-eval-harness installed:
+#   pip install lm-eval==0.4.9
+
+usage() {
+    echo``
+    echo "Runs lm eval harness on ChartQA using multimodal vllm."
+    echo "This pathway is intended to be used to create baselines for "
+    echo "our correctness tests in vllm's CI."
+    echo
+    echo "usage: ${0} <options>"
+    echo
+    echo "  -m    - huggingface stub or local directory of the model"
+    echo "  -l    - limit number of samples to run"
+    echo "  -t    - tensor parallel size to run at"
+    echo
+}
+
+while getopts "m:l:t:" OPT; do
+  case ${OPT} in
+    m )
+        MODEL="$OPTARG"
+        ;;
+    l )
+        LIMIT="$OPTARG"
+        ;;
+    t )
+        TP_SIZE="$OPTARG"
+        ;;
+    \? )
+        usage
+        exit 1
+        ;;
+  esac
+done
+
+lm_eval --model vllm-vlm \
+  --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE" \
+  --tasks chartqa \
+  --batch_size auto \
+  --apply_chat_template \
+  --limit $LIMIT
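For reference, the same evaluation can be driven through lm-eval's Python API rather than the CLI. A minimal sketch that mirrors the flags the script passes, using the Qwen2.5-VL-7B model from the new multimodal config; it assumes lm-eval==0.4.9 and a GPU, and the result-dict keys shown are illustrative:

```python
import lm_eval

# Mirrors: lm_eval --model vllm-vlm --model_args ... --tasks chartqa \
#          --batch_size auto --apply_chat_template --limit 2500
results = lm_eval.simple_evaluate(
    model="vllm-vlm",
    model_args="pretrained=Qwen/Qwen2.5-VL-7B-Instruct,tensor_parallel_size=1",
    tasks=["chartqa"],
    batch_size="auto",
    apply_chat_template=True,
    limit=2500,
    num_fewshot=0,
)

# Metric key follows the "relaxed_accuracy,none" naming used in the config above.
print(results["results"]["chartqa"]["relaxed_accuracy,none"])
```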
							
								
								
									
.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh (0 lines changed, Normal file → Executable file)

@@ -0,0 +1,50 @@
+#!/bin/bash
+# We can use this script to compute baseline accuracy on MMLUPRO for vllm.
+# We use this for fp8, which HF does not support.
+#
+# Make sure you have lm-eval-harness installed:
+#   pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api]
+
+usage() {
+    echo``
+    echo "Runs lm eval harness on MMLU Pro using huggingface transformers."
+    echo "This pathway is intended to be used to create baselines for "
+    echo "our automated nm-test-accuracy workflow"
+    echo
+    echo "usage: ${0} <options>"
+    echo
+    echo "  -m    - huggingface stub or local directory of the model"
+    echo "  -l    - limit number of samples to run"
+    echo "  -f    - number of fewshot samples to use"
+    echo "  -t    - tensor parallel size to run at"
+    echo
+}
+
+while getopts "m:b:l:f:t:" OPT; do
+  case ${OPT} in
+    m )
+        MODEL="$OPTARG"
+        ;;
+    b )
+        BATCH_SIZE="$OPTARG"
+        ;;
+    l )
+        LIMIT="$OPTARG"
+        ;;
+    f )
+        FEWSHOT="$OPTARG"
+        ;;
+    t )
+        TP_SIZE="$OPTARG"
+        ;;
+    \? )
+        usage
+        exit 1
+        ;;
+  esac
+done
+
+lm_eval --model vllm \
+  --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,add_bos_token=true,trust_remote_code=true,max_model_len=4096" \
+  --tasks mmlu_pro --num_fewshot "$FEWSHOT" --limit "$LIMIT" \
+  --batch_size auto
@@ -19,21 +19,27 @@ RTOL = 0.08
 def launch_lm_eval(eval_config, tp_size):
     trust_remote_code = eval_config.get("trust_remote_code", False)
     max_model_len = eval_config.get("max_model_len", 4096)
+    batch_size = eval_config.get("batch_size", "auto")
+    backend = eval_config.get("backend", "vllm")
     model_args = (
         f"pretrained={eval_config['model_name']},"
         f"tensor_parallel_size={tp_size},"
         f"enforce_eager=true,"
         f"add_bos_token=true,"
         f"trust_remote_code={trust_remote_code},"
-        f"max_model_len={max_model_len}"
+        f"max_model_len={max_model_len},"
     )
     results = lm_eval.simple_evaluate(
-        model="vllm",
+        model=backend,
         model_args=model_args,
         tasks=[task["name"] for task in eval_config["tasks"]],
         num_fewshot=eval_config["num_fewshot"],
         limit=eval_config["limit"],
-        batch_size="auto",
+        # TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed help
+        # text models. however, this is regressing measured strict-match for
+        # existing text models in CI, so only apply it for mm.
+        apply_chat_template=backend == "vllm-vlm",
+        batch_size=batch_size,
     )
     return results
 
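A rough sketch of how the YAML configs added above feed this helper and get checked against the RTOL = 0.08 tolerance from the same file. The loop below is illustrative, not the test's exact code; PyYAML and numpy are assumed, and `launch_lm_eval` refers to the helper shown in the diff:

```python
import numpy as np
import yaml


def check_config(config_path: str, tp_size: int = 1) -> None:
    RTOL = 0.08  # tolerance defined at the top of the test file
    with open(config_path) as f:
        eval_config = yaml.safe_load(f)
    # launch_lm_eval is the helper from the diff above.
    results = launch_lm_eval(eval_config, tp_size)
    for task in eval_config["tasks"]:
        for metric in task["metrics"]:
            measured = results["results"][task["name"]][metric["name"]]
            expected = metric["value"]
            assert np.isclose(expected, measured, rtol=RTOL), (
                f"{task['name']}/{metric['name']}: expected {expected}, got {measured}"
            )
```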
@@ -8,7 +8,7 @@ This benchmark aims to:
 
 Latest results: [results link](https://blog.vllm.ai/2024/09/05/perf-update.html), scroll to the end.
 
-Latest reproduction guilde: [github issue link](https://github.com/vllm-project/vllm/issues/8176)
+Latest reproduction guide: [github issue link](https://github.com/vllm-project/vllm/issues/8176)
 
 ## Setup
 
@@ -368,7 +368,7 @@ if __name__ == "__main__":
         # The GPUs sometimes come in format of "GPUTYPE\nGPUTYPE\n...",
         # we want to turn it into "8xGPUTYPE"
         df["GPU"] = df["GPU"].apply(
-            lambda x: f"{len(x.split('\n'))}x{x.split('\n')[0]}"
+            lambda x: f"{len(x.splitlines())}x{x.splitlines()[0]}"
         )
 
     # get markdown tables
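A quick check of the replacement's behavior, and the likely motivation: a backslash (the `'\n'` literal) inside an f-string expression is a SyntaxError before Python 3.12, while `str.splitlines()` needs no backslash and gives the same result for this kind of input:

```python
gpu = "H100\nH100\nH100\nH100\nH100\nH100\nH100\nH100"

# Equivalent result here, without a backslash inside the f-string expression.
label = f"{len(gpu.splitlines())}x{gpu.splitlines()[0]}"
print(label)  # -> 8xH100
```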
@@ -181,18 +181,14 @@ launch_vllm_server() {
   if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then
     echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience."
     model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model')
-    server_command="python3 \
-        -m vllm.entrypoints.openai.api_server \
+    server_command="vllm serve $model \
         -tp $tp \
-        --model $model \
         --port $port \
         $server_args"
   else
     echo "Key 'fp8' does not exist in common params."
-    server_command="python3 \
-        -m vllm.entrypoints.openai.api_server \
+    server_command="vllm serve $model \
         -tp $tp \
-        --model $model \
         --port $port \
         $server_args"
   fi
@@ -365,8 +365,7 @@ run_serving_tests() {
       continue
     fi
 
-    server_command="$server_envs python3 \
-      -m vllm.entrypoints.openai.api_server \
+    server_command="$server_envs vllm serve \
       $server_args"
 
     # run the server
@@ -455,11 +454,6 @@ main() {
   fi
   check_hf_token
 
-  # Set to v1 to run v1 benchmark
-  if [[ "${ENGINE_VERSION:-v0}" == "v1" ]]; then
-    export VLLM_USE_V1=1
-  fi
-
   # dependencies
   (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
   (which jq) || (apt-get update && apt-get -y install jq)
@@ -1,46 +0,0 @@
-# This local pyproject file is part of the migration from yapf to ruff format.
-# It uses the same core rules as the main pyproject.toml file, but with the
-# following differences:
-# - ruff line length is overridden to 88
-# - deprecated typing ignores (UP006, UP035) have been removed
-
-[tool.ruff]
-line-length = 88
-
-[tool.ruff.lint.per-file-ignores]
-"vllm/third_party/**" = ["ALL"]
-"vllm/version.py" = ["F401"]
-"vllm/_version.py" = ["ALL"]
-
-[tool.ruff.lint]
-select = [
-    # pycodestyle
-    "E",
-    # Pyflakes
-    "F",
-    # pyupgrade
-    "UP",
-    # flake8-bugbear
-    "B",
-    # flake8-simplify
-    "SIM",
-    # isort
-    "I",
-    # flake8-logging-format
-    "G",
-]
-ignore = [
-    # star imports
-    "F405", "F403",
-    # lambda expression assignment
-    "E731",
-    # Loop control variable not used within loop body
-    "B007",
-    # f-string format
-    "UP032",
-    # Can remove once 3.10+ is the minimum Python version
-    "UP007",
-]
-
-[tool.ruff.format]
-docstring-code-format = true
@@ -1,24 +1,37 @@
 steps:
-  # aarch64 + CUDA builds. PyTorch 2.8 aarch64 + CUDA wheel is only available on CUDA 12.9
+  # aarch64 + CUDA builds
   - label: "Build arm64 wheel - CUDA 12.9"
+    depends_on: ~
     id: build-wheel-arm64-cuda-12-9
     agents:
       queue: arm64_cpu_queue_postmerge
     commands:
       # #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
       # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg VLLM_MAIN_CUDA_VERSION=12.9 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
       - "mkdir artifacts"
       - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
       - "bash .buildkite/scripts/upload-wheels.sh"
     env:
       DOCKER_BUILDKIT: "1"
 
-  - block: "Build CUDA 12.8 wheel"
-    key: block-build-cu128-wheel
+  # aarch64 build
+  - label: "Build arm64 CPU wheel"
+    depends_on: ~
+    id: build-wheel-arm64-cpu
+    agents:
+      queue: arm64_cpu_queue_postmerge
+    commands:
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg VLLM_BUILD_ACL=ON --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile.cpu ."
+      - "mkdir artifacts"
+      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
+      - "bash .buildkite/scripts/upload-wheels.sh"
+    env:
+      DOCKER_BUILDKIT: "1"
 
+  # x86 + CUDA builds
   - label: "Build wheel - CUDA 12.8"
-    depends_on: block-build-cu128-wheel
+    depends_on: ~
     id: build-wheel-cuda-12-8
     agents:
       queue: cpu_queue_postmerge
@@ -30,37 +43,33 @@ steps:
     env:
       DOCKER_BUILDKIT: "1"
 
-  - block: "Build CUDA 12.6 wheel"
-    key: block-build-cu126-wheel
-    depends_on: ~
-
-  - label: "Build wheel - CUDA 12.6"
-    depends_on: block-build-cu126-wheel
-    id: build-wheel-cuda-12-6
-    agents:
-      queue: cpu_queue_postmerge
-    commands:
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.6.3 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
-      - "mkdir artifacts"
-      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
-      - "bash .buildkite/scripts/upload-wheels.sh"
-    env:
-      DOCKER_BUILDKIT: "1"
-
-  # x86 + CUDA builds
   - label: "Build wheel - CUDA 12.9"
     depends_on: ~
     id: build-wheel-cuda-12-9
     agents:
       queue: cpu_queue_postmerge
     commands:
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
       - "mkdir artifacts"
       - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
       - "bash .buildkite/scripts/upload-wheels.sh"
     env:
       DOCKER_BUILDKIT: "1"
 
+  - label: "Build wheel - CUDA 13.0"
+    depends_on: ~
+    id: build-wheel-cuda-13-0
+    agents:
+      queue: cpu_queue_postmerge
+    commands:
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=13.0.1 --build-arg BUILD_BASE_IMAGE=nvidia/cuda:13.0.1-devel-ubuntu22.04 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
+      - "mkdir artifacts"
+      - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
+      - "bash .buildkite/scripts/upload-wheels.sh"
+    env:
+      DOCKER_BUILDKIT: "1"
+
+  # Build release images (12.9)
   - label: "Build release image (x86)"
     depends_on: ~
     id: build-release-image-x86
@@ -68,13 +77,12 @@ steps:
       queue: cpu_queue_postmerge
     commands:
       - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
       - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
       # re-tag to default image tag and push, just in case arm64 build fails
       - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
       - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
 
-  # PyTorch 2.8 aarch64 + CUDA wheel is only available on CUDA 12.9
   - label: "Build release image (arm64)"
     depends_on: ~
     id: build-release-image-arm64
@ -82,7 +90,7 @@ steps:
 | 
				
			|||||||
      queue: arm64_cpu_queue_postmerge
 | 
					      queue: arm64_cpu_queue_postmerge
 | 
				
			||||||
    commands:
 | 
					    commands:
 | 
				
			||||||
      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
 | 
					      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
 | 
				
			||||||
      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
 | 
					      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
 | 
				
			||||||
      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
 | 
					      - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  # Add job to create multi-arch manifest
 | 
					  # Add job to create multi-arch manifest
 | 
				
			||||||
@ -102,8 +110,6 @@ steps:
 | 
				
			|||||||
    depends_on:
 | 
					    depends_on:
 | 
				
			||||||
      - create-multi-arch-manifest
 | 
					      - create-multi-arch-manifest
 | 
				
			||||||
      - build-wheel-cuda-12-8
 | 
					      - build-wheel-cuda-12-8
 | 
				
			||||||
      - build-wheel-cuda-12-6
 | 
					 | 
				
			||||||
      - build-wheel-cuda-12-9
 | 
					 | 
				
			||||||
    id: annotate-release-workflow
 | 
					    id: annotate-release-workflow
 | 
				
			||||||
    agents:
 | 
					    agents:
 | 
				
			||||||
      queue: cpu_queue_postmerge
 | 
					      queue: cpu_queue_postmerge
 | 
				
			||||||
@ -150,6 +156,22 @@ steps:
 | 
				
			|||||||
    env:
 | 
					    env:
 | 
				
			||||||
      DOCKER_BUILDKIT: "1"
 | 
					      DOCKER_BUILDKIT: "1"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  - block: "Build arm64 CPU release image"
 | 
				
			||||||
 | 
					    key: block-arm64-cpu-release-image-build
 | 
				
			||||||
 | 
					    depends_on: ~
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  - label: "Build and publish arm64 CPU release image"
 | 
				
			||||||
 | 
					    depends_on: block-arm64-cpu-release-image-build
 | 
				
			||||||
 | 
					    agents:
 | 
				
			||||||
 | 
					      queue: arm64_cpu_queue_postmerge
 | 
				
			||||||
 | 
					    commands:
 | 
				
			||||||
 | 
					      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
 | 
				
			||||||
 | 
					      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
 | 
				
			||||||
 | 
					      - "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:latest"
 | 
				
			||||||
 | 
					      - "docker push public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
 | 
				
			||||||
 | 
					    env:
 | 
				
			||||||
 | 
					      DOCKER_BUILDKIT: "1"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  - label: "Build and publish nightly multi-arch image to DockerHub"
 | 
					  - label: "Build and publish nightly multi-arch image to DockerHub"
 | 
				
			||||||
    depends_on:
 | 
					    depends_on:
 | 
				
			||||||
      - create-multi-arch-manifest
 | 
					      - create-multi-arch-manifest
 | 
				
			||||||
@ -158,11 +180,16 @@ steps:
 | 
				
			|||||||
      queue: cpu_queue_postmerge
 | 
					      queue: cpu_queue_postmerge
 | 
				
			||||||
    commands:
 | 
					    commands:
 | 
				
			||||||
      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
 | 
					      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
 | 
				
			||||||
      - "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
 | 
					      - "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64"
 | 
				
			||||||
      - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT vllm/vllm-openai:nightly"
 | 
					      - "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64"
 | 
				
			||||||
      - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
 | 
					      - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 vllm/vllm-openai:nightly-x86_64"
 | 
				
			||||||
      - "docker push vllm/vllm-openai:nightly"
 | 
					      - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 vllm/vllm-openai:nightly-aarch64"
 | 
				
			||||||
      - "docker push vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
 | 
					      - "docker push vllm/vllm-openai:nightly-x86_64"
 | 
				
			||||||
 | 
					      - "docker push vllm/vllm-openai:nightly-aarch64"
 | 
				
			||||||
 | 
					      - "docker manifest create vllm/vllm-openai:nightly vllm/vllm-openai:nightly-x86_64 vllm/vllm-openai:nightly-aarch64 --amend"
 | 
				
			||||||
 | 
					      - "docker manifest create vllm/vllm-openai:nightly-$BUILDKITE_COMMIT vllm/vllm-openai:nightly-x86_64 vllm/vllm-openai:nightly-aarch64 --amend"
 | 
				
			||||||
 | 
					      - "docker manifest push vllm/vllm-openai:nightly"
 | 
				
			||||||
 | 
					      - "docker manifest push vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
 | 
				
			||||||
      # Clean up old nightly builds (keep only last 14)
 | 
					      # Clean up old nightly builds (keep only last 14)
 | 
				
			||||||
      - "bash .buildkite/scripts/cleanup-nightly-builds.sh"
 | 
					      - "bash .buildkite/scripts/cleanup-nightly-builds.sh"
 | 
				
			||||||
    plugins:
 | 
					    plugins:
 | 
				
			||||||
@ -171,3 +198,4 @@ steps:
 | 
				
			|||||||
          password-env: DOCKERHUB_TOKEN
 | 
					          password-env: DOCKERHUB_TOKEN
 | 
				
			||||||
    env:
 | 
					    env:
 | 
				
			||||||
      DOCKER_BUILDKIT: "1"
 | 
					      DOCKER_BUILDKIT: "1"
 | 
				
			||||||
 | 
					      DOCKERHUB_USERNAME: "vllmbot"
 | 
				
			||||||
 | 
				
			|||||||
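The rewritten nightly step above follows the usual per-architecture tag plus `docker manifest` stitching pattern: push one image per architecture, then create and push a single multi-arch tag that references both. Purely as an illustration of that shape (the label, image name, and tags below are hypothetical, not part of the pipeline), such a step boils down to:

```yaml
# Illustrative sketch only; image name "example/app" and tags are hypothetical.
- label: "Publish multi-arch example"
  commands:
    - "docker push example/app:tag-x86_64"                                            # per-arch images first
    - "docker push example/app:tag-aarch64"
    - "docker manifest create example/app:latest example/app:tag-x86_64 example/app:tag-aarch64 --amend"
    - "docker manifest push example/app:latest"                                       # consumers pull one multi-arch tag
```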
```diff
@@ -14,18 +14,33 @@ buildkite-agent annotate --style 'info' --context 'release-workflow' << EOF
 To download the wheel:
 \`\`\`
 aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
+aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux2014_aarch64.whl .
+
 aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu126/vllm-${RELEASE_VERSION}+cu126-cp38-abi3-manylinux1_x86_64.whl .
-aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu118/vllm-${RELEASE_VERSION}+cu118-cp38-abi3-manylinux1_x86_64.whl .
+aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu129/vllm-${RELEASE_VERSION}+cu129-cp38-abi3-manylinux1_x86_64.whl .
 \`\`\`

 To download and upload the image:

 \`\`\`
-docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}
-docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT} vllm/vllm-openai
-docker tag vllm/vllm-openai vllm/vllm-openai:latest
-docker tag vllm/vllm-openai vllm/vllm-openai:v${RELEASE_VERSION}
-docker push vllm/vllm-openai:latest
-docker push vllm/vllm-openai:v${RELEASE_VERSION}
+docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64
+docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64
+
+docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64 vllm/vllm-openai:x86_64
+docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:latest-x86_64
+docker tag vllm/vllm-openai:x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-x86_64
+docker push vllm/vllm-openai:latest-x86_64
+docker push vllm/vllm-openai:v${RELEASE_VERSION}-x86_64
+
+docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64 vllm/vllm-openai:aarch64
+docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:latest-aarch64
+docker tag vllm/vllm-openai:aarch64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
+docker push vllm/vllm-openai:latest-aarch64
+docker push vllm/vllm-openai:v${RELEASE_VERSION}-aarch64
+
+docker manifest create vllm/vllm-openai:latest vllm/vllm-openai:latest-x86_64 vllm/vllm-openai:latest-aarch64 --amend
+docker manifest create vllm/vllm-openai:v${RELEASE_VERSION} vllm/vllm-openai:v${RELEASE_VERSION}-x86_64 vllm/vllm-openai:v${RELEASE_VERSION}-aarch64 --amend
+docker manifest push vllm/vllm-openai:latest
+docker manifest push vllm/vllm-openai:v${RELEASE_VERSION}
 \`\`\`
 EOF
```
```diff
@@ -8,20 +8,41 @@ set -ex
 # DockerHub API endpoint for vllm/vllm-openai repository
 REPO_API_URL="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags"

-# Get DockerHub token from environment
+# Get DockerHub credentials from environment
 if [ -z "$DOCKERHUB_TOKEN" ]; then
     echo "Error: DOCKERHUB_TOKEN environment variable is not set"
     exit 1
 fi

+if [ -z "$DOCKERHUB_USERNAME" ]; then
+    echo "Error: DOCKERHUB_USERNAME environment variable is not set"
+    exit 1
+fi
+
+# Get DockerHub bearer token
+echo "Getting DockerHub bearer token..."
+set +x
+BEARER_TOKEN=$(curl -s -X POST \
+    -H "Content-Type: application/json" \
+    -d "{\"username\": \"$DOCKERHUB_USERNAME\", \"password\": \"$DOCKERHUB_TOKEN\"}" \
+    "https://hub.docker.com/v2/users/login" | jq -r '.token')
+set -x
+
+if [ -z "$BEARER_TOKEN" ] || [ "$BEARER_TOKEN" = "null" ]; then
+    echo "Error: Failed to get DockerHub bearer token"
+    exit 1
+fi
+
 # Function to get all tags from DockerHub
 get_all_tags() {
     local page=1
     local all_tags=""

     while true; do
-        local response=$(curl -s -H "Authorization: Bearer $DOCKERHUB_TOKEN" \
+        set +x
+        local response=$(curl -s -H "Authorization: Bearer $BEARER_TOKEN" \
             "$REPO_API_URL?page=$page&page_size=100")
+        set -x

         # Get both last_updated timestamp and tag name, separated by |
         local tags=$(echo "$response" | jq -r '.results[] | select(.name | startswith("nightly-")) | "\(.last_updated)|\(.name)"')
@@ -43,7 +64,9 @@ delete_tag() {
     echo "Deleting tag: $tag_name"

     local delete_url="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags/$tag_name"
-    local response=$(curl -s -X DELETE -H "Authorization: Bearer $DOCKERHUB_TOKEN" "$delete_url")
+    set +x
+    local response=$(curl -s -X DELETE -H "Authorization: Bearer $BEARER_TOKEN" "$delete_url")
+    set -x

     if echo "$response" | jq -e '.detail' > /dev/null 2>&1; then
         echo "Warning: Failed to delete tag $tag_name: $(echo "$response" | jq -r '.detail')"
```
```diff
@@ -86,10 +86,6 @@ if [[ $commands == *"pytest -v -s models/test_registry.py"* ]]; then
   commands=${commands//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"}
 fi

-if [[ $commands == *"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"* ]]; then
-  commands=${commands//"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"/"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2 and not BambaForCausalLM and not Gemma2ForCausalLM and not Grok1ModelForCausalLM and not Zamba2ForCausalLM and not Gemma2Model and not GritLM'"}
-fi
-
 if [[ $commands == *"pytest -v -s compile/test_basic_correctness.py"* ]]; then
   commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s compile/test_basic_correctness.py"}
 fi
@@ -167,12 +163,6 @@ if [[ $commands == *" entrypoints/llm "* ]]; then
   --ignore=entrypoints/llm/test_prompt_validation.py "}
 fi

-#Obsolete currently
-##ignore certain Entrypoints/llm tests
-#if [[ $commands == *" && pytest -v -s entrypoints/llm/test_guided_generate.py"* ]]; then
-#  commands=${commands//" && pytest -v -s entrypoints/llm/test_guided_generate.py"/" "}
-#fi
-
 # --ignore=entrypoints/openai/test_encoder_decoder.py \
 # --ignore=entrypoints/openai/test_embedding.py \
 # --ignore=entrypoints/openai/test_oot_registration.py
```
```diff
@@ -25,25 +25,28 @@ function cpu_tests() {

   # offline inference
   podman exec -it "$container_id" bash -c "
-    set -e
-    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
+    set -xve
+    python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" >> $HOME/test_basic.log

   # Run basic model test
   podman exec -it "$container_id" bash -c "
-    set -e
+    set -evx
     pip install pytest pytest-asyncio einops peft Pillow soundfile transformers_stream_generator matplotlib
     pip install sentence-transformers datamodel_code_generator
-    pytest -v -s tests/models/language/generation/test_bart.py -m cpu_model
+
+    # Note: disable Bart until supports V1
+    # pytest -v -s tests/models/language/generation/test_bart.py -m cpu_model
     pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-openai-community/gpt2]
     pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-facebook/opt-125m]
     pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-google/gemma-1.1-2b-it]
     pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach]
-    pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model"
+    # TODO: Below test case tests/models/language/pooling/test_embedding.py::test_models[True-ssmits/Qwen2-7B-Instruct-embed-base] fails on ppc64le. Disabling it for time being.
+    # pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model" >> $HOME/test_rest.log
 }

 # All of CPU tests are expected to be finished less than 40 mins.

 export container_id
 export -f cpu_tests
-timeout 40m bash -c cpu_tests
+timeout 120m bash -c cpu_tests

```
```diff
@@ -58,15 +58,11 @@ function cpu_tests() {
     # pytest -x -v -s tests/kernels/attention/test_cache.py -m cpu_model
     # pytest -x -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model

-    # Note: disable Bart until supports V1
-    pytest -x -v -s tests/models/language/generation -m cpu_model \
-                --ignore=tests/models/language/generation/test_bart.py
-    VLLM_CPU_SGL_KERNEL=1 pytest -x -v -s tests/models/language/generation -m cpu_model \
-                --ignore=tests/models/language/generation/test_bart.py
+    pytest -x -v -s tests/models/language/generation -m cpu_model
+    VLLM_CPU_SGL_KERNEL=1 pytest -x -v -s tests/models/language/generation -m cpu_model

     pytest -x -v -s tests/models/language/pooling -m cpu_model
     pytest -x -v -s tests/models/multimodal/generation \
-                --ignore=tests/models/multimodal/generation/test_mllama.py \
                 --ignore=tests/models/multimodal/generation/test_pixtral.py \
                 -m cpu_model"

@@ -74,7 +70,7 @@ function cpu_tests() {
   docker exec cpu-test-"$NUMA_NODE" bash -c "
     set -e
     pytest -x -s -v \
-    tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs[False-10-32-neuralmagic/Llama-3.2-1B-quantized.w8a8]"
+    tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs"

   # Note: disable it until supports V1
   # Run AWQ test
```
.buildkite/scripts/hardware_ci/run-npu-test.sh (new file, 191 lines):

```bash
#!/bin/bash

# This script build the Ascend NPU docker image and run the offline inference inside the container.
# It serves a sanity check for compilation and basic model usage.
set -ex

# Base ubuntu image with basic ascend development libraries and python installed
VLLM_ASCEND_REPO="https://github.com/vllm-project/vllm-ascend.git"
CONFIG_FILE_REMOTE_PATH="tests/e2e/vllm_interface/vllm_test.cfg"
TEST_RUN_CONFIG_FILE="vllm_test.cfg"
VLLM_ASCEND_TMP_DIR=
# Get the test run configuration file from the vllm-ascend repository
fetch_vllm_test_cfg() {
    VLLM_ASCEND_TMP_DIR=$(mktemp -d)
    # Ensure that the temporary directory is cleaned up when an exception occurs during configuration file retrieval
    cleanup() {
        rm -rf "${VLLM_ASCEND_TMP_DIR}"
    }
    trap cleanup EXIT

    GIT_TRACE=1 git clone -v --depth 1 "${VLLM_ASCEND_REPO}" "${VLLM_ASCEND_TMP_DIR}"
    if [ ! -f "${VLLM_ASCEND_TMP_DIR}/${CONFIG_FILE_REMOTE_PATH}" ]; then
        echo "Error: file '${CONFIG_FILE_REMOTE_PATH}' does not exist in the warehouse" >&2
        exit 1
    fi

    # If the file already exists locally, just overwrite it
    cp "${VLLM_ASCEND_TMP_DIR}/${CONFIG_FILE_REMOTE_PATH}" "${TEST_RUN_CONFIG_FILE}"
    echo "Copied ${CONFIG_FILE_REMOTE_PATH} to ${TEST_RUN_CONFIG_FILE}"

    # Since the trap will be overwritten later, and when it is executed here, the task of cleaning up resources
    # when the trap is abnormal has been completed, so the temporary resources are manually deleted here.
    rm -rf "${VLLM_ASCEND_TMP_DIR}"
    trap - EXIT
}

# Downloads test run configuration file from a remote URL.
# Loads the configuration into the current script environment.
get_config() {
    if [ ! -f "${TEST_RUN_CONFIG_FILE}" ]; then
        echo "Error: file '${TEST_RUN_CONFIG_FILE}' does not exist in the warehouse" >&2
        exit 1
    fi
    source "${TEST_RUN_CONFIG_FILE}"
    echo "Base docker image name that get from configuration: ${BASE_IMAGE_NAME}"
    return 0
}

# get test running configuration.
fetch_vllm_test_cfg
get_config
# Check if the function call was successful. If not, exit the script.
if [ $? -ne 0 ]; then
  exit 1
fi

image_name="npu/vllm-ci:${BUILDKITE_COMMIT}_${EPOCHSECONDS}"
container_name="npu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"

# BUILDKITE_AGENT_NAME format is {hostname}-{agent_idx}-{npu_card_num}cards
agent_idx=$(echo "${BUILDKITE_AGENT_NAME}" | awk -F'-' '{print $(NF-1)}')
echo "agent_idx: ${agent_idx}"
builder_name="cachebuilder${agent_idx}"
builder_cache_dir="/mnt/docker-cache${agent_idx}"
mkdir -p ${builder_cache_dir}

# Try building the docker image
cat <<EOF | DOCKER_BUILDKIT=1 docker build \
    --add-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local:${PYPI_CACHE_HOST} \
    --builder ${builder_name} --cache-from type=local,src=${builder_cache_dir} \
                           --cache-to type=local,dest=${builder_cache_dir},mode=max \
    --progress=plain --load -t ${image_name} -f - .
FROM ${BASE_IMAGE_NAME}

# Define environments
ENV DEBIAN_FRONTEND=noninteractive

RUN pip config set global.index-url http://cache-service-vllm.nginx-pypi-cache.svc.cluster.local:${PYPI_CACHE_PORT}/pypi/simple && \
    pip config set global.trusted-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local && \
    apt-get update -y && \
    apt-get install -y python3-pip git vim wget net-tools gcc g++ cmake libnuma-dev && \
    rm -rf /var/cache/apt/* && \
    rm -rf /var/lib/apt/lists/*

# Install for pytest to make the docker build cache layer always valid
RUN --mount=type=cache,target=/root/.cache/pip \
    pip install pytest>=6.0  modelscope

WORKDIR /workspace/vllm

# Install vLLM dependencies in advance. Effect: As long as common.txt remains unchanged, the docker cache layer will be valid.
COPY requirements/common.txt /workspace/vllm/requirements/common.txt
RUN --mount=type=cache,target=/root/.cache/pip \
    pip install -r requirements/common.txt

COPY . .

# Install vLLM
RUN --mount=type=cache,target=/root/.cache/pip \
    VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \
    python3 -m pip uninstall -y triton

# Install vllm-ascend
WORKDIR /workspace
ARG VLLM_ASCEND_REPO=https://github.com/vllm-project/vllm-ascend.git
ARG VLLM_ASCEND_TAG=main
RUN git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf "https://github.com/" && \
    git clone --depth 1 \$VLLM_ASCEND_REPO --branch \$VLLM_ASCEND_TAG /workspace/vllm-ascend

# Install vllm dependencies in advance. Effect: As long as common.txt remains unchanged, the docker cache layer will be valid.
RUN --mount=type=cache,target=/root/.cache/pip \
    pip install -r /workspace/vllm-ascend/requirements.txt

RUN --mount=type=cache,target=/root/.cache/pip \
    export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \
    source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
    source /usr/local/Ascend/nnal/atb/set_env.sh && \
    export LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
    python3 -m pip install -v -e /workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/

ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
ENV VLLM_USE_MODELSCOPE=True

WORKDIR /workspace/vllm-ascend

CMD ["/bin/bash"]

EOF

# Setup cleanup
remove_docker_container() {
  docker rm -f "${container_name}" || true;
  docker image rm -f "${image_name}" || true;
  docker system prune -f || true;
}
trap remove_docker_container EXIT

# Generate corresponding --device args based on BUILDKITE_AGENT_NAME
# Ascend NPU BUILDKITE_AGENT_NAME format is {hostname}-{agent_idx}-{npu_card_num}cards, and agent_idx starts from 1.
#   e.g. atlas-a2-001-1-2cards means this is the 1-th agent on atlas-a2-001 host, and it has 2 NPU cards.
#   returns --device /dev/davinci0 --device /dev/davinci1
parse_and_gen_devices() {
    local input="$1"
    local index cards_num
    if [[ "$input" =~ ([0-9]+)-([0-9]+)cards$ ]]; then
        index="${BASH_REMATCH[1]}"
        cards_num="${BASH_REMATCH[2]}"
    else
        echo "parse error" >&2
        return 1
    fi

    local devices=""
    local i=0
    while (( i < cards_num )); do
        local dev_idx=$(((index - 1)*cards_num + i ))
        devices="$devices --device /dev/davinci${dev_idx}"
        ((i++))
    done

    # trim leading space
    devices="${devices#"${devices%%[![:space:]]*}"}"
    # Output devices: assigned to the caller variable
    printf '%s' "$devices"
}

devices=$(parse_and_gen_devices "${BUILDKITE_AGENT_NAME}") || exit 1

# Run the image and execute the Out-Of-Tree (OOT) platform interface test case on Ascend NPU hardware.
# This test checks whether the OOT platform interface is functioning properly in conjunction with
# the hardware plugin vllm-ascend.
model_cache_dir=/mnt/modelscope${agent_idx}
mkdir -p ${model_cache_dir}
docker run \
    ${devices} \
    --device /dev/davinci_manager \
    --device /dev/devmm_svm \
    --device /dev/hisi_hdc \
    -v /usr/local/dcmi:/usr/local/dcmi \
    -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
    -v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
    -v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
    -v /etc/ascend_install.info:/etc/ascend_install.info \
    -v ${model_cache_dir}:/root/.cache/modelscope \
    --entrypoint="" \
    --name "${container_name}" \
    "${image_name}" \
    bash -c '
    set -e
    pytest -v -s tests/e2e/vllm_interface/
'
```
```diff
@@ -62,12 +62,11 @@ echo "--- Installing Python dependencies ---"
 python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
     && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
     && python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
-    && python3 -m pip install --progress-bar off hf-transfer
+    && python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
 echo "--- Python dependencies installed ---"
-export VLLM_USE_V1=1
+
 export VLLM_XLA_CHECK_RECOMPILATION=1
 export VLLM_XLA_CACHE_PATH=
-echo "Using VLLM V1"

 echo "--- Hardware Information ---"
 # tpu-info
```
```diff
@@ -62,12 +62,11 @@ echo "--- Installing Python dependencies ---"
 python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
     && python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
     && python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \
-    && python3 -m pip install --progress-bar off hf-transfer
+    && python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0
 echo "--- Python dependencies installed ---"
-export VLLM_USE_V1=1
+
 export VLLM_XLA_CHECK_RECOMPILATION=1
 export VLLM_XLA_CACHE_PATH=
-echo "Using VLLM V1"

 echo "--- Hardware Information ---"
 # tpu-info
```
```diff
@@ -35,16 +35,14 @@ docker run \
     python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 -O3 -O.cudagraph_mode=NONE
     python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend ray
     python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager -tp 2 --distributed-executor-backend mp
-    VLLM_ATTENTION_BACKEND=TRITON_ATTN_VLLM_V1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
+    VLLM_ATTENTION_BACKEND=TRITON_ATTN python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
     cd tests
     pytest -v -s v1/core
     pytest -v -s v1/engine
     pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py
     pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py
     pytest -v -s v1/structured_output
-    pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_eagle.py --ignore=v1/spec_decode/test_tree_attention.py
+    pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py
     pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py
     pytest -v -s v1/test_serial_utils.py
-    pytest -v -s v1/test_utils.py
-    pytest -v -s v1/test_metrics_reader.py
 '
```
```diff
@@ -18,7 +18,7 @@ vllm bench throughput --input-len 256 --output-len 256 --output-json throughput_
 bench_throughput_exit_code=$?

 # run server-based benchmarks and upload the result to buildkite
-python3 -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-2-7b-chat-hf &
+vllm serve meta-llama/Llama-2-7b-chat-hf &
 server_pid=$!
 wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json

```
.buildkite/scripts/run-prime-rl-test.sh (new executable file, 59 lines):

```bash
#!/bin/bash
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

# Setup script for Prime-RL integration tests
# This script prepares the environment for running Prime-RL tests with nightly vLLM

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"
PRIME_RL_REPO="https://github.com/PrimeIntellect-ai/prime-rl.git"
PRIME_RL_DIR="${REPO_ROOT}/prime-rl"

echo "Setting up Prime-RL integration test environment..."

# Clean up any existing Prime-RL directory
if [ -d "${PRIME_RL_DIR}" ]; then
    echo "Removing existing Prime-RL directory..."
    rm -rf "${PRIME_RL_DIR}"
fi

# Install UV if not available
if ! command -v uv &> /dev/null; then
    echo "Installing UV package manager..."
    curl -LsSf https://astral.sh/uv/install.sh | sh
    source $HOME/.local/bin/env
fi

# Clone Prime-RL repository at specific branch for reproducible tests
PRIME_RL_BRANCH="integ-vllm-main"
echo "Cloning Prime-RL repository at branch: ${PRIME_RL_BRANCH}..."
git clone --branch "${PRIME_RL_BRANCH}" --single-branch "${PRIME_RL_REPO}" "${PRIME_RL_DIR}"
cd "${PRIME_RL_DIR}"

echo "Setting up UV project environment..."
export UV_PROJECT_ENVIRONMENT=/usr/local
ln -s /usr/bin/python3 /usr/local/bin/python

# Remove vllm pin from pyproject.toml
echo "Removing vllm pin from pyproject.toml..."
sed -i '/vllm==/d' pyproject.toml

# Sync Prime-RL dependencies
echo "Installing Prime-RL dependencies..."
uv sync --inexact && uv sync --inexact --all-extras

# Verify installation
echo "Verifying installations..."
uv run python -c "import vllm; print(f'vLLM version: {vllm.__version__}')"
uv run python -c "import prime_rl; print('Prime-RL imported successfully')"

echo "Prime-RL integration test environment setup complete!"

echo "Running Prime-RL integration tests..."
export WANDB_MODE=offline # this makes this test not require a WANDB_API_KEY
uv run pytest -vs tests/integration/test_rl.py -m gpu

echo "Prime-RL integration tests completed!"
```
```diff
@@ -9,6 +9,6 @@ MAX_NUM_BATCHED_TOKENS=1024
 TENSOR_PARALLEL_SIZE=1
 MAX_MODEL_LEN=2048
 DOWNLOAD_DIR=/mnt/disks/persist
-EXPECTED_THROUGHPUT=10.0
+EXPECTED_THROUGHPUT=8.7
 INPUT_LEN=1800
 OUTPUT_LEN=128
```
```diff
@@ -42,7 +42,7 @@ echo "lanching vllm..."
 echo "logging to $VLLM_LOG"
 echo

-VLLM_USE_V1=1 vllm serve $MODEL \
+vllm serve $MODEL \
  --seed 42 \
  --max-num-seqs $MAX_NUM_SEQS \
  --max-num-batched-tokens $MAX_NUM_BATCHED_TOKENS \
```
```diff
@@ -58,33 +58,25 @@ python3 .buildkite/generate_index.py --wheel "$normal_wheel"
 aws s3 cp "$wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
 aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"

-if [[ $normal_wheel == *"cu126"* ]]; then
-    # if $normal_wheel matches cu126, do not upload the index.html
-    echo "Skipping index files for cu126 wheels"
-elif [[ $normal_wheel == *"cu128"* ]]; then
-    # if $normal_wheel matches cu128, do not upload the index.html
-    echo "Skipping index files for cu128 wheels"
-else
+if [[ $normal_wheel == *"cu129"* ]]; then
     # only upload index.html for cu129 wheels (default wheels) as it
     # is available on both x86 and arm64
     aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
     aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
+else
+    echo "Skipping index files for non-cu129 wheels"
 fi

 # generate index for nightly
 aws s3 cp "$wheel" "s3://vllm-wheels/nightly/"
 aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/"

-if [[ $normal_wheel == *"cu126"* ]]; then
-    # if $normal_wheel matches cu126, do not upload the index.html
-    echo "Skipping index files for cu126 wheels"
-elif [[ $normal_wheel == *"cu128"* ]]; then
-    # if $normal_wheel matches cu128, do not upload the index.html
-    echo "Skipping index files for cu128 wheels"
-else
+if [[ $normal_wheel == *"cu129"* ]]; then
     # only upload index.html for cu129 wheels (default wheels) as it
     # is available on both x86 and arm64
     aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html"
+else
+    echo "Skipping index files for non-cu129 wheels"
 fi

 aws s3 cp "$wheel" "s3://vllm-wheels/$version/"
```

.buildkite/test-amd.yaml (new file, 1319 lines): diff not shown because the file is too large.
@@ -6,24 +6,28 @@
 # to generate the final pipeline yaml file.
 
 # Documentation
-# label(str): the name of the test. emoji allowed.
-# fast_check(bool): whether to run this on each commit on fastcheck pipeline.
-# torch_nightly(bool): whether to run this on vllm against torch nightly pipeline.
-# fast_check_only(bool): run this test on fastcheck pipeline only
-# optional(bool): never run this test by default (i.e. need to unblock manually) unless it's scheduled nightly run.
+# label(str): the name of the test. emojis allowed.
+# fast_check(bool): whether to run this on each commit on the fastcheck pipeline.
+# torch_nightly(bool): whether to run this on vllm against the torch nightly pipeline.
+# fast_check_only(bool): run this test on the fastcheck pipeline only
+# optional(bool): never run this test by default (i.e. need to unblock manually) unless it's a scheduled nightly run.
+# soft_fail(bool): allow this step to fail without failing the entire pipeline (useful for flaky or experimental tests).
 # command(str): the single command to run for tests. incompatible with commands.
-# commands(list): the list of commands to run for test. incompatbile with command.
-# mirror_hardwares(list): the list of hardwares to run the test on as well. currently only supports [amd]
-# gpu(str): override the GPU selection for the test. default is on L4 GPUs. currently only supports a100
-# num_gpus(int): override the number of GPUs for the test. default to 1 GPU. currently support 2,4.
-# num_nodes(int): whether to simulate multi-node setup by launch multiple containers on one host,
-#     in this case, commands must be specified. the first command runs on first host, the second
+# commands(list): the list of commands to run for the test. incompatible with command.
+# mirror_hardwares(list): the list of hardware to run the test on as well. currently only supports [amdexperimental]
+# gpu(str): override the GPU selection for the test. default is L4 GPUs. supports a100, b200, h200
+# num_gpus(int): override the number of GPUs for the test. defaults to 1 GPU. currently supports 2,4.
+# num_nodes(int): whether to simulate multi-node setup by launching multiple containers on one host,
+#     in this case, commands must be specified. the first command runs on the first host, the second
 #     command runs on the second host.
-# working_dir(str): specify the place where command should execute, default to /vllm-workspace/tests
-# source_file_dependencies(list): the list of prefix to opt-in the test for, if empty, the test will always run.
+# timeout_in_minutes(int): sets a timeout for the step in minutes. if not specified, uses the default timeout.
+# parallelism(int): number of parallel jobs to run for this step. enables test sharding using $$BUILDKITE_PARALLEL_JOB
+#     and $$BUILDKITE_PARALLEL_JOB_COUNT environment variables.
+# working_dir(str): specify the place where the command should execute, default to /vllm-workspace/tests
+# source_file_dependencies(list): the list of prefixes to opt-in the test for, if empty, the test will always run.
 
 # When adding a test
-# - If the test belong to an existing group, add it there
+# - If the test belongs to an existing group, add it there
 # - If the test is short, add to any existing step
 # - If the test takes more than 10min, then it is okay to create a new step.
 #   Note that all steps execute in parallel.
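To make the schema documented above concrete, here is a minimal, hypothetical step definition that exercises the documented keys; the label, source paths, and test directory are placeholders for illustration only and are not part of this change:

- label: Example Kernel Test # 10min
  timeout_in_minutes: 20            # per-step timeout; steps without it use the default timeout
  gpu: a100                         # override the default L4 runner
  num_gpus: 2                       # request 2 GPUs instead of the default 1
  optional: true                    # only runs when unblocked manually or on a scheduled nightly run
  source_file_dependencies:         # prefixes that opt this step in; an empty list means always run
  - vllm/example_subsystem/
  - tests/example_kernels
  commands:                         # list form; mutually exclusive with the single `command` key
  - pytest -v -s example_kernels

A step that sets parallelism greater than 1 is sharded across Buildkite jobs, with $$BUILDKITE_PARALLEL_JOB and $$BUILDKITE_PARALLEL_JOB_COUNT available to split the work, as several of the steps in the hunks below do.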
@@ -46,25 +50,28 @@ steps:
   mirror_hardwares: [amdexperimental]
   source_file_dependencies:
   - vllm/
-  - tests/mq_llm_engine
-  - tests/async_engine
+  - tests/multimodal
+  - tests/utils_
+  commands:
+  - pytest -v -s -m 'not cpu_test' multimodal
+  - pytest -v -s utils_
+
+- label: Async Engine, Inputs, Utils, Worker Test (CPU) # 4 mins
+  timeout_in_minutes: 10
+  source_file_dependencies:
+  - vllm/
   - tests/test_inputs.py
   - tests/test_outputs.py
   - tests/multimodal
-  - tests/utils_
-  - tests/worker
   - tests/standalone_tests/lazy_imports.py
   - tests/transformers_utils
+  no_gpu: true
   commands:
   - python3 standalone_tests/lazy_imports.py
-  - pytest -v -s mq_llm_engine # MQLLMEngine
-  - pytest -v -s async_engine # AsyncLLMEngine
   - pytest -v -s test_inputs.py
   - pytest -v -s test_outputs.py
-  - pytest -v -s multimodal
-  - pytest -v -s utils_ # Utils
-  - pytest -v -s worker # Worker
-  - pytest -v -s transformers_utils # transformers_utils
+  - pytest -v -s -m 'cpu_test' multimodal
+  - pytest -v -s transformers_utils
 
 - label: Python-only Installation Test # 10min
   timeout_in_minutes: 20
@@ -84,25 +91,12 @@ steps:
   - vllm/
   - tests/basic_correctness/test_basic_correctness
   - tests/basic_correctness/test_cpu_offload
-  - tests/basic_correctness/test_preemption
   - tests/basic_correctness/test_cumem.py
   commands:
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
   - pytest -v -s basic_correctness/test_cumem.py
   - pytest -v -s basic_correctness/test_basic_correctness.py
   - pytest -v -s basic_correctness/test_cpu_offload.py
-  - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
-
-- label: Core Test # 22min
-  timeout_in_minutes: 35
-  mirror_hardwares: [amdexperimental]
-  fast_check: true
-  source_file_dependencies:
-  - vllm/core
-  - vllm/distributed
-  - tests/core
-  commands:
-  - pytest -v -s core
 
 - label: Entrypoints Unit Tests # 5min
   timeout_in_minutes: 10
@@ -127,10 +121,9 @@ steps:
   - tests/entrypoints/offline_mode
   commands:
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-  - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
-  - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
+  - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
   - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
-  - VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
+  - pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
 
 - label: Entrypoints Integration Test (API Server) # 100min
   timeout_in_minutes: 130
@@ -168,7 +161,6 @@ steps:
   num_gpus: 4
   source_file_dependencies:
   - vllm/distributed/
-  - vllm/core/
   - tests/distributed/test_utils
   - tests/distributed/test_pynccl
   - tests/distributed/test_events
@@ -176,28 +168,36 @@ steps:
   - examples/offline_inference/rlhf.py
   - examples/offline_inference/rlhf_colocate.py
   - tests/examples/offline_inference/data_parallel.py
-  - tests/v1/test_async_llm_dp.py
-  - tests/v1/test_external_lb_dp.py
-  - tests/v1/test_internal_lb_dp.py
-  - tests/v1/test_hybrid_lb_dp.py
+  - tests/v1/distributed
   - tests/v1/engine/test_engine_core_client.py
+  - tests/distributed/test_symm_mem_allreduce.py
   commands:
-  # test with tp=2 and external_dp=2
-  - VLLM_USE_V1=0 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
+  # https://github.com/NVIDIA/nccl/issues/1838
+  - export NCCL_CUMEM_HOST_ENABLE=0
+  # test with torchrun tp=2 and external_dp=2
   - torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
-  # test with tp=2 and pp=2
+  # test with torchrun tp=2 and pp=2
   - PP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
+  # test with torchrun tp=4 and dp=1
+  - TP_SIZE=4 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
+  # test with torchrun tp=2, pp=2 and dp=1
+  - PP_SIZE=2 TP_SIZE=2 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
+  # test with torchrun tp=1 and dp=4 with ep
+  - DP_SIZE=4 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
+  # test with torchrun tp=2 and dp=2 with ep
+  - TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py
   # test with internal dp
   - python3 ../examples/offline_inference/data_parallel.py --enforce-eager
-  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
-  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_external_lb_dp.py
-  - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/test_internal_lb_dp.py
-  - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/test_hybrid_lb_dp.py
+  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
+  - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
+  - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py
+  - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py
   - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
   - pytest -v -s distributed/test_utils.py
   - pytest -v -s compile/test_basic_correctness.py
   - pytest -v -s distributed/test_pynccl.py
   - pytest -v -s distributed/test_events.py
+  - pytest -v -s distributed/test_symm_mem_allreduce.py
   # TODO: create a dedicated test section for multi-GPU example tests
   # when we have multiple distributed example tests
   - pushd ../examples/offline_inference
@@ -230,16 +230,14 @@ steps:
   num_gpus: 2
   source_file_dependencies:
   - vllm/
-  - tests/metrics
   - tests/v1/tracing
   commands:
-  - pytest -v -s metrics
   - "pip install \
       'opentelemetry-sdk>=1.26.0' \
       'opentelemetry-api>=1.26.0' \
       'opentelemetry-exporter-otlp>=1.26.0' \
       'opentelemetry-semantic-conventions-ai>=0.4.1'"
-  - pytest -v -s tracing
+  - pytest -v -s v1/tracing
 
 ##### fast check tests  #####
 #####  1 GPU test  #####
@@ -300,23 +298,35 @@ steps:
     - tests/v1
   commands:
     # split the test to avoid interference
-    - pytest -v -s v1/core
+    - pytest -v -s -m 'not cpu_test' v1/core
     - pytest -v -s v1/executor
+    - pytest -v -s v1/kv_offload
     - pytest -v -s v1/sample
     - pytest -v -s v1/logits_processors
     - pytest -v -s v1/worker
-    - pytest -v -s v1/structured_output
     - pytest -v -s v1/spec_decode
-    - pytest -v -s v1/kv_connector/unit
-    - pytest -v -s v1/metrics
-    - pytest -v -s v1/test_serial_utils.py
-    - pytest -v -s v1/test_utils.py
+    - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit
+    - pytest -v -s -m 'not cpu_test' v1/metrics
     - pytest -v -s v1/test_oracle.py
-    - pytest -v -s v1/test_metrics_reader.py
+    - pytest -v -s v1/test_request.py
     # Integration test for streaming correctness (requires special branch).
     - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api
     - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
 
+- label: V1 Test others (CPU) # 5 mins
+  source_file_dependencies:
+    - vllm/
+    - tests/v1
+  no_gpu: true
+  commands:
+    # split the test to avoid interference
+    - pytest -v -s -m 'cpu_test' v1/core
+    - pytest -v -s v1/structured_output
+    - pytest -v -s v1/test_serial_utils.py
+    - pytest -v -s -m 'cpu_test' v1/kv_connector/unit
+    - pytest -v -s -m 'cpu_test' v1/metrics
+
+
 - label: Examples Test # 30min
   timeout_in_minutes: 45
   mirror_hardwares: [amdexperimental]
@@ -335,12 +345,14 @@ steps:
     - python3 offline_inference/vision_language.py --seed 0
     - python3 offline_inference/vision_language_pooling.py --seed 0
     - python3 offline_inference/vision_language_multi_image.py --seed 0
-    - VLLM_USE_V1=0 python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
+    - python3 others/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 others/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
     - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
     - python3 offline_inference/basic/classify.py
     - python3 offline_inference/basic/embed.py
     - python3 offline_inference/basic/score.py
-    - VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2
+    - python3 offline_inference/spec_decode.py --test --method eagle --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 2048
+    # https://github.com/vllm-project/vllm/pull/26682 uses slightly more memory in PyTorch 2.9+ causing this test to OOM in 1xL4 GPU
+    - python3 offline_inference/spec_decode.py --test --method eagle3 --num_spec_tokens 3 --dataset-name hf --dataset-path philschmid/mt-bench --num-prompts 80 --temp 0 --top-p 1.0 --top-k -1 --tp 1 --enable-chunked-prefill --max-model-len 1536
 
 - label: Platform Tests (CUDA) # 4min
   timeout_in_minutes: 15
@@ -375,7 +387,12 @@ steps:
       --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
       --ignore=lora/test_chatglm3_tp.py \
       --ignore=lora/test_llama_tp.py \
-      --ignore=lora/test_llm_with_multi_loras.py
+      --ignore=lora/test_llm_with_multi_loras.py \
+      --ignore=lora/test_olmoe_tp.py \
+      --ignore=lora/test_deepseekv2_tp.py \
+      --ignore=lora/test_gptoss.py \
+      --ignore=lora/test_qwen3moe_tp.py
+
   parallelism: 4
 
 - label: PyTorch Compilation Unit Tests # 15min
@@ -389,11 +406,12 @@ steps:
     - pytest -v -s compile/test_pass_manager.py
     - pytest -v -s compile/test_fusion.py
     - pytest -v -s compile/test_fusion_attn.py
+    - pytest -v -s compile/test_functionalization.py
     - pytest -v -s compile/test_silu_mul_quant_fusion.py
-    - pytest -v -s compile/test_sequence_parallelism.py
-    - pytest -v -s compile/test_async_tp.py
     - pytest -v -s compile/test_fusion_all_reduce.py
     - pytest -v -s compile/test_decorator.py
+    - pytest -v -s compile/test_noop_elimination.py
+    - pytest -v -s compile/test_aot_compile.py
 
 - label: PyTorch Fullgraph Smoke Test # 15min
   timeout_in_minutes: 30
@@ -406,8 +424,8 @@ steps:
   - pytest -v -s compile/test_basic_correctness.py
   - pytest -v -s compile/piecewise/
 
-- label: PyTorch Fullgraph Test # 20min
-  timeout_in_minutes: 30
+- label: PyTorch Fullgraph Test # 22min
+  timeout_in_minutes: 35
   mirror_hardwares: [amdexperimental]
   torch_nightly: true
   source_file_dependencies:
@@ -415,6 +433,7 @@ steps:
   - tests/compile
   commands:
   - pytest -v -s compile/test_full_graph.py
+  - pytest -v -s compile/test_fusions_e2e.py
 
 - label: Kernels Core Operation Test # 48min
   timeout_in_minutes: 75
@@ -422,8 +441,9 @@ steps:
   source_file_dependencies:
   - csrc/
   - tests/kernels/core
+  - tests/kernels/test_top_k_per_row.py
   commands:
-    - pytest -v -s kernels/core
+    - pytest -v -s kernels/core kernels/test_top_k_per_row.py
 
 - label: Kernels Attention Test %N # 23min
   timeout_in_minutes: 35
@@ -467,32 +487,22 @@ steps:
   source_file_dependencies:
   - csrc/mamba/
   - tests/kernels/mamba
+  - vllm/model_executor/layers/mamba/ops
   commands:
     - pytest -v -s kernels/mamba
 
-- label: Tensorizer Test # 14min
-  timeout_in_minutes: 25
-  mirror_hardwares: [amdexperimental]
-  source_file_dependencies:
-  - vllm/model_executor/model_loader
-  - tests/tensorizer_loader
-  - tests/entrypoints/openai/test_tensorizer_entrypoint.py
-  commands:
-    - apt-get update && apt-get install -y curl libsodium23
-    - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-    - pytest -v -s tensorizer_loader
-    - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py
-
-- label: Model Executor Test # 7min
-  timeout_in_minutes: 20
+- label: Model Executor Test # 23min
+  timeout_in_minutes: 35
   mirror_hardwares: [amdexperimental]
   source_file_dependencies:
   - vllm/model_executor
   - tests/model_executor
+  - tests/entrypoints/openai/test_tensorizer_entrypoint.py
   commands:
     - apt-get update && apt-get install -y curl libsodium23
     - export VLLM_WORKER_MULTIPROC_METHOD=spawn
     - pytest -v -s model_executor
+    - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py
 
 - label: Benchmarks # 11min
   timeout_in_minutes: 20
@@ -526,8 +536,9 @@ steps:
   # since torchao nightly is only compatible with torch nightly currently
   # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now
   # we can only upgrade after this is resolved
-  - pip install --pre torchao==0.13.0.dev20250814 --index-url https://download.pytorch.org/whl/nightly/cu128
-  - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
+  # TODO(jerryzh168): resolve the above comment
+  - uv pip install --system torchao==0.13.0 --index-url https://download.pytorch.org/whl/cu129
+  - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py
 
 - label: LM Eval Small Models # 53min
   timeout_in_minutes: 75
@@ -548,15 +559,6 @@ steps:
   commands: # LMEval+Transcription WER check
   - pytest -s entrypoints/openai/correctness/
 
-- label: Encoder Decoder tests # 12min
-  timeout_in_minutes: 20
-  mirror_hardwares: [amdexperimental]
-  source_file_dependencies:
-  - vllm/
-  - tests/encoder_decoder
-  commands:
-    - pytest -v -s encoder_decoder
-
 - label: OpenAI-Compatible Tool Use # 23 min
   timeout_in_minutes: 35
   mirror_hardwares: [amdexperimental]
@@ -564,43 +566,105 @@ steps:
   source_file_dependencies:
     - vllm/
     - tests/tool_use
-    - tests/mistral_tool_use
   commands:
-    - pytest -v -s tool_use
-    - pytest -v -s mistral_tool_use
+    - pytest -v -s -m 'not cpu_test' tool_use
+
+- label: OpenAI-Compatible Tool Use (CPU) # 5 mins
+  timeout_in_minutes: 10
+  source_file_dependencies:
+    - vllm/
+    - tests/tool_use
+  no_gpu: true
+  commands:
+    - pytest -v -s -m 'cpu_test' tool_use
+
 #####  models test  #####
 
-- label: Basic Models Test # 57min
-  timeout_in_minutes: 75
+- label: Basic Models Tests (Initialization)
+  timeout_in_minutes: 45
   mirror_hardwares: [amdexperimental]
   torch_nightly: true
   source_file_dependencies:
   - vllm/
-  - tests/models
+  - tests/models/test_initialization.py
   commands:
-    - pytest -v -s models/test_transformers.py
-    - pytest -v -s models/test_registry.py
-    - pytest -v -s models/test_utils.py
-    - pytest -v -s models/test_vision.py
-    - pytest -v -s models/test_initialization.py
+    # Run a subset of model initialization tests
+    - pytest -v -s models/test_initialization.py::test_can_initialize_small_subset
 
-- label: Language Models Test (Standard) # 35min
+- label: Basic Models Tests (Extra Initialization) %N
   timeout_in_minutes: 45
   mirror_hardwares: [amdexperimental]
   torch_nightly: true
   source_file_dependencies:
+  - vllm/model_executor/models/
+  - tests/models/test_initialization.py
+  commands:
+    # Only when vLLM model source is modified - test initialization of a large
+    # subset of supported models (the complement of the small subset in the above
+    # test.) Also run if model initialization test file is modified
+    - pytest -v -s models/test_initialization.py \
+             -k 'not test_can_initialize_small_subset' \
+             --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
+             --shard-id=$$BUILDKITE_PARALLEL_JOB
+  parallelism: 2
+
+- label: Basic Models Tests (Other)
+  timeout_in_minutes: 45
+  mirror_hardwares: [amdexperimental]
+  torch_nightly: true
+  source_file_dependencies:
+  - vllm/
+  - tests/models/test_transformers.py
+  - tests/models/test_registry.py
+  commands:
+    - pytest -v -s models/test_transformers.py models/test_registry.py
+
+- label: Basic Models Test (Other CPU) # 5min
+  timeout_in_minutes: 10
+  torch_nightly: true
+  source_file_dependencies:
+  - vllm/
+  - tests/models/test_utils.py
+  - tests/models/test_vision.py
+  no_gpu: true
+  commands:
+    - pytest -v -s models/test_utils.py models/test_vision.py
+
+- label: Language Models Tests (Standard)
+  timeout_in_minutes: 25
+  mirror_hardwares: [amdexperimental]
+  torch_nightly: true
+  source_file_dependencies:
   - vllm/
   - tests/models/language
   commands:
+    # Test standard language models, excluding a subset of slow tests
     - pip freeze | grep -E 'torch'
-    - pytest -v -s models/language -m core_model
+    - pytest -v -s models/language -m 'core_model and (not slow_test)'
 
-- label: Language Models Test (Hybrid) # 35 min
+- label: Language Models Tests (Extra Standard) %N
   timeout_in_minutes: 45
   mirror_hardwares: [amdexperimental]
   torch_nightly: true
   source_file_dependencies:
+  - vllm/model_executor/models/
+  - tests/models/language/pooling/test_embedding.py
+  - tests/models/language/generation/test_common.py
+  - tests/models/language/pooling/test_classification.py
+  commands:
+    # Shard slow subset of standard language models tests. Only run when model
+    # source is modified, or when specified test files are modified
+    - pip freeze | grep -E 'torch'
+    - pytest -v -s models/language -m 'core_model and slow_test' \
+             --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
+             --shard-id=$$BUILDKITE_PARALLEL_JOB
+  parallelism: 2
+
+- label: Language Models Tests (Hybrid) %N
+  timeout_in_minutes: 75
+  mirror_hardwares: [amdexperimental]
+  torch_nightly: true
+  source_file_dependencies:
   - vllm/
   - tests/models/language/generation
   commands:
@@ -608,7 +672,12 @@ steps:
     # Note: also needed to run plamo2 model in vLLM
     - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
     - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
-    - pytest -v -s models/language/generation -m hybrid_model
+    # Shard hybrid language model tests
+    - pytest -v -s models/language/generation \
+                   -m hybrid_model \
+                   --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \
+                   --shard-id=$$BUILDKITE_PARALLEL_JOB
+  parallelism: 2
 
 - label: Language Models Test (Extended Generation) # 80min
   timeout_in_minutes: 110
@@ -674,6 +743,16 @@ steps:
     - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing
     - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model  # Otherwise, mp_method="spawn" doesn't work
 
+- label: Multi-Modal Accuracy Eval (Small Models) # 50min
+  timeout_in_minutes: 70
+  working_dir: "/vllm-workspace/.buildkite/lm-eval-harness"
+  source_file_dependencies:
+  - vllm/multimodal/
+  - vllm/inputs/
+  - vllm/v1/core/
+  commands:
+  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1
+
 - label: Multi-Modal Models Test (Extended) 1
   mirror_hardwares: [amdexperimental]
   optional: true
@@ -729,14 +808,16 @@ steps:
   commands:
     - pip install --upgrade git+https://github.com/huggingface/transformers
    - pytest -v -s tests/models/test_initialization.py
+    - pytest -v -s tests/models/test_transformers.py
     - pytest -v -s tests/models/multimodal/processing/
     - pytest -v -s tests/models/multimodal/test_mapping.py
     - python3 examples/offline_inference/basic/chat.py
-    - python3 examples/offline_inference/audio_language.py --model-type whisper
     - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
+    # Whisper needs spawn method to avoid deadlock
+    - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
 
-- label: Blackwell Test # 38 min
-  timeout_in_minutes: 60
+- label: Blackwell Test # 21 min
+  timeout_in_minutes: 30
   working_dir: "/vllm-workspace/"
   gpu: b200
   # optional: true
@@ -749,8 +830,6 @@ steps:
   - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
   - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
   - vllm/v1/attention/backends/flashinfer.py
-  - vllm/compilation/fusion.py
-  - vllm/compilation/fusion_attn.py
   commands:
     - nvidia-smi
     - python3 examples/offline_inference/basic/chat.py
@@ -763,17 +842,77 @@ steps:
     # Quantization
     - pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8'
     - pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py
-    - pytest -v -s tests/kernels/quantization/test_silu_nvfp4_quant_fusion.py
+    - pytest -v -s tests/kernels/quantization/test_silu_mul_nvfp4_quant.py
     - pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
     - pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py
     - pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
+    - pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py
+    - pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py
     - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
-    - pytest -v -s tests/kernels/moe/test_mxfp4_moe.py
-    # Fusion
-    - pytest -v -s tests/compile/test_fusion_all_reduce.py
-    - pytest -v -s tests/compile/test_fusion_attn.py::test_attention_quant_pattern
+    - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py
     - pytest -v -s tests/kernels/moe/test_flashinfer.py
+
+- label: Blackwell Fusion Tests # 30 min
+  timeout_in_minutes: 40
+  working_dir: "/vllm-workspace/"
+  gpu: b200
+  source_file_dependencies:
+  - csrc/quantization/fp4/
+  - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py
+  - vllm/v1/attention/backends/flashinfer.py
+  - vllm/compilation/
+  # can affect pattern matching
+  - vllm/model_executor/layers/layernorm.py
+  - vllm/model_executor/layers/activation.py
+  - vllm/model_executor/layers/quantization/input_quant_fp8.py
+  commands:
+    - nvidia-smi
+    - pytest -v -s tests/compile/test_fusion_attn.py
     - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py
+    # this runner has 2 GPUs available even though num_gpus=2 is not set
+    - pytest -v -s tests/compile/test_fusion_all_reduce.py
+    - pytest -v -s tests/compile/test_fusions_e2e.py
+
+- label: Blackwell GPT-OSS Eval
+  timeout_in_minutes: 60
+  working_dir: "/vllm-workspace/"
+  gpu: b200
+  optional: true # run on nightlies
+  source_file_dependencies:
+  - tests/evals/gpt_oss
+  - vllm/model_executor/models/gpt_oss.py
+  - vllm/model_executor/layers/quantization/mxfp4.py
+  - vllm/v1/attention/backends/flashinfer.py
+  commands:
+    - uv pip install --system 'gpt-oss[eval]==0.0.5'
+    - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58
+
+- label: Blackwell Quantized MoE Test
+  timeout_in_minutes: 60
+  working_dir: "/vllm-workspace/"
+  gpu: b200
+  source_file_dependencies:
+  - tests/quantization/test_blackwell_moe.py
+  - vllm/model_executor/models/deepseek_v2.py
+  - vllm/model_executor/models/gpt_oss.py
+  - vllm/model_executor/models/llama4.py
+  - vllm/model_executor/layers/fused_moe
+  - vllm/model_executor/layers/quantization/compressed_tensors
+  - vllm/model_executor/layers/quantization/modelopt.py
+  - vllm/model_executor/layers/quantization/mxfp4.py
+  - vllm/v1/attention/backends/flashinfer.py
+  commands:
+    - pytest -s -v tests/quantization/test_blackwell_moe.py
+
+- label: Blackwell LM Eval Small Models
+  timeout_in_minutes: 120
+  gpu: b200
+  optional: true # run on nightlies
+  source_file_dependencies:
+  - csrc/
+  - vllm/model_executor/layers/quantization
+  commands:
+  - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1
 
 #####  1 GPU test  #####
 #####  multi gpus test  #####
@@ -789,6 +928,8 @@ steps:
   commands:
   - pytest -v -s distributed/test_comm_ops.py
   - pytest -v -s distributed/test_shm_broadcast.py
+  - pytest -v -s distributed/test_shm_buffer.py
+  - pytest -v -s distributed/test_shm_storage.py
 
 - label: 2 Node Tests (4 GPUs in total) # 16min
   timeout_in_minutes: 30
@@ -815,47 +956,61 @@ steps:
     - NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
     - python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
 
-- label: Distributed Tests (2 GPUs) # 110min
-  timeout_in_minutes: 150
+- label: Distributed Tests (2 GPUs) # 68min
+  timeout_in_minutes: 90
   mirror_hardwares: [amdexperimental]
   working_dir: "/vllm-workspace/tests"
   num_gpus: 2
   source_file_dependencies:
+  - vllm/compilation/
   - vllm/distributed/
   - vllm/engine/
   - vllm/executor/
-  - vllm/model_executor/models/
-  - tests/distributed/
-  - vllm/compilation
   - vllm/worker/worker_base.py
-  - vllm/worker/worker.py
-  - vllm/worker/model_runner.py
-  - entrypoints/llm/test_collective_rpc.py
-  - tests/v1/test_async_llm_dp.py
-  - tests/v1/test_external_lb_dp.py
-  - tests/v1/entrypoints/openai/test_multi_api_servers.py
   - vllm/v1/engine/
+  - vllm/v1/worker/
+  - tests/compile/test_basic_correctness.py
+  - tests/compile/test_wrapper.py
+  - tests/distributed/
+  - tests/entrypoints/llm/test_collective_rpc.py
+  - tests/v1/distributed
+  - tests/v1/entrypoints/openai/test_multi_api_servers.py
+  - tests/v1/shutdown
+  - tests/v1/worker/test_worker_memory_snapshot.py
   commands:
-  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
-  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_external_lb_dp.py
+  # https://github.com/NVIDIA/nccl/issues/1838
+  - export NCCL_CUMEM_HOST_ENABLE=0
+  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py
+  - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py
  - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
   - pytest -v -s entrypoints/llm/test_collective_rpc.py
   - pytest -v -s ./compile/test_basic_correctness.py
   - pytest -v -s ./compile/test_wrapper.py
   - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
+  - VLLM_TEST_SAME_HOST=1 VLLM_TEST_WITH_DEFAULT_DEVICE_SET=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
+  - pytest -v -s distributed/test_sequence_parallel.py
+  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
+  - pytest -v -s v1/worker/test_worker_memory_snapshot.py
+
+- label: Distributed Model Tests (2 GPUs) # 37min
+  timeout_in_minutes: 50
+  mirror_hardwares: [amdexperimental]
+  working_dir: "/vllm-workspace/tests"
+  num_gpus: 2
+  source_file_dependencies:
+  - vllm/model_executor/model_loader/sharded_state_loader.py
+  - vllm/model_executor/models/
+  - tests/basic_correctness/
+  - tests/model_executor/model_loader/test_sharded_state_loader.py
+  - tests/models/
+  commands:
   - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m 'distributed(num_gpus=2)'
+  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s model_executor/model_loader/test_sharded_state_loader.py
   # Avoid importing model tests that cause CUDA reinitialization error
   - pytest models/test_transformers.py -v -s -m 'distributed(num_gpus=2)'
   - pytest models/language -v -s -m 'distributed(num_gpus=2)'
   - pytest models/multimodal -v -s -m 'distributed(num_gpus=2)' --ignore models/multimodal/generation/test_whisper.py
   - VLLM_WORKER_MULTIPROC_METHOD=spawn pytest models/multimodal/generation/test_whisper.py -v -s -m 'distributed(num_gpus=2)'
-  # test sequence parallel
-  - pytest -v -s distributed/test_sequence_parallel.py
-  # this test fails consistently.
-  # TODO: investigate and fix
-  - VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
-  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
-  - pytest -v -s models/multimodal/generation/test_maverick.py
 
 - label: Plugin Tests (2 GPUs) # 40min
   timeout_in_minutes: 60
@ -876,6 +1031,11 @@ steps:
 | 
				
			|||||||
  - pytest -v -s plugins_tests/test_io_processor_plugins.py
 | 
					  - pytest -v -s plugins_tests/test_io_processor_plugins.py
 | 
				
			||||||
  - pip uninstall prithvi_io_processor_plugin -y
 | 
					  - pip uninstall prithvi_io_processor_plugin -y
 | 
				
			||||||
  # end io_processor plugins test
 | 
					  # end io_processor plugins test
 | 
				
			||||||
 | 
					  # begin stat_logger plugins test
 | 
				
			||||||
 | 
					  - pip install -e ./plugins/vllm_add_dummy_stat_logger
 | 
				
			||||||
 | 
					  - pytest -v -s plugins_tests/test_stats_logger_plugins.py
 | 
				
			||||||
 | 
					  - pip uninstall dummy_stat_logger -y
 | 
				
			||||||
 | 
					  # end stat_logger plugins test
 | 
				
			||||||
  # other tests continue here:
 | 
					  # other tests continue here:
 | 
				
			||||||
  - pytest -v -s plugins_tests/test_scheduler_plugins.py
 | 
					  - pytest -v -s plugins_tests/test_scheduler_plugins.py
 | 
				
			||||||
  - pip install -e ./plugins/vllm_add_dummy_model
 | 
					  - pip install -e ./plugins/vllm_add_dummy_model
 | 
				
			||||||
@ -898,7 +1058,6 @@ steps:
 | 
				
			|||||||
  commands:
 | 
					  commands:
 | 
				
			||||||
  - pytest -v -s distributed/test_pp_cudagraph.py
 | 
					  - pytest -v -s distributed/test_pp_cudagraph.py
 | 
				
			||||||
  - pytest -v -s distributed/test_pipeline_parallel.py
 | 
					  - pytest -v -s distributed/test_pipeline_parallel.py
 | 
				
			||||||
  # - pytest -v -s distributed/test_context_parallel.py # TODO: enable it on Hopper runners or add triton MLA support
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
- label: LoRA TP Test (Distributed) # 17 min
 | 
					- label: LoRA TP Test (Distributed) # 17 min
 | 
				
			||||||
  timeout_in_minutes: 30
 | 
					  timeout_in_minutes: 30
 | 
				
			||||||
@ -916,6 +1075,7 @@ steps:
 | 
				
			|||||||
    - pytest -v -s -x lora/test_chatglm3_tp.py
 | 
					    - pytest -v -s -x lora/test_chatglm3_tp.py
 | 
				
			||||||
    - pytest -v -s -x lora/test_llama_tp.py
 | 
					    - pytest -v -s -x lora/test_llama_tp.py
 | 
				
			||||||
    - pytest -v -s -x lora/test_llm_with_multi_loras.py
 | 
					    - pytest -v -s -x lora/test_llm_with_multi_loras.py
 | 
				
			||||||
 | 
					    - pytest -v -s -x lora/test_olmoe_tp.py
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
- label: Weight Loading Multiple GPU Test  # 33min
 | 
					- label: Weight Loading Multiple GPU Test  # 33min
 | 
				
			||||||
@ -942,6 +1102,17 @@ steps:
 | 
				
			|||||||
  commands:
 | 
					  commands:
 | 
				
			||||||
    - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
 | 
					    - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
 | 
				
			||||||
  
 | 
					  
 | 
				
			||||||
 | 
					- label: NixlConnector PD accuracy tests (Distributed) # 30min
 | 
				
			||||||
 | 
					  timeout_in_minutes: 30
 | 
				
			||||||
 | 
					  working_dir: "/vllm-workspace/tests"
 | 
				
			||||||
 | 
					  num_gpus: 4
 | 
				
			||||||
 | 
					  source_file_dependencies:
 | 
				
			||||||
 | 
					    - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py
 | 
				
			||||||
 | 
					    - tests/v1/kv_connector/nixl_integration/
 | 
				
			||||||
 | 
					  commands:
 | 
				
			||||||
 | 
					    - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt
 | 
				
			||||||
 | 
					    - bash v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
##### multi gpus test #####
 | 
					##### multi gpus test #####
 | 
				
			||||||
##### A100 test #####
 | 
					##### A100 test #####
 | 
				
			||||||
@ -972,9 +1143,38 @@ steps:
 | 
				
			|||||||
  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
 | 
					  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
 | 
				
			||||||
  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
 | 
					  - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4
 | 
				
			||||||
 | 
					
 | 
				
			||||||
- label: Qwen MoE EP Test # optional
 | 
					##### H200 test #####
 | 
				
			||||||
 | 
					- label: Distributed Tests (H200) # optional
 | 
				
			||||||
  gpu: h200
 | 
					  gpu: h200
 | 
				
			||||||
  optional: true
 | 
					  optional: true
 | 
				
			||||||
 | 
					  working_dir: "/vllm-workspace/"
 | 
				
			||||||
  num_gpus: 2
 | 
					  num_gpus: 2
 | 
				
			||||||
  commands:
 | 
					  commands:
 | 
				
			||||||
    - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 /vllm-workspace/examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1  --dp-size=2 --max-model-len 2048
 | 
					    - pytest -v -s tests/compile/test_async_tp.py
 | 
				
			||||||
 | 
					    - pytest -v -s tests/compile/test_sequence_parallelism.py
 | 
				
			||||||
 | 
					    - pytest -v -s tests/compile/test_fusion_all_reduce.py
 | 
				
			||||||
 | 
					    - pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm
 | 
				
			||||||
 | 
					    - pytest -v -s tests/distributed/test_context_parallel.py
 | 
				
			||||||
 | 
					    - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1  --dp-size=2 --max-model-len 2048
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					##### B200 test #####
 | 
				
			||||||
 | 
					- label: Distributed Tests (B200) # optional
 | 
				
			||||||
 | 
					  gpu: b200
 | 
				
			||||||
 | 
					  optional: true
 | 
				
			||||||
 | 
					  working_dir: "/vllm-workspace/"
 | 
				
			||||||
 | 
					  num_gpus: 2
 | 
				
			||||||
 | 
					  commands:
 | 
				
			||||||
 | 
					    - pytest -v -s tests/distributed/test_context_parallel.py
 | 
				
			||||||
 | 
					    - pytest -v -s tests/distributed/test_nccl_symm_mem_allreduce.py
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					##### RL Integration Tests #####
 | 
				
			||||||
 | 
					- label: Prime-RL Integration Test # 15min
 | 
				
			||||||
 | 
					  timeout_in_minutes: 30
 | 
				
			||||||
 | 
					  optional: true
 | 
				
			||||||
 | 
					  num_gpus: 2
 | 
				
			||||||
 | 
					  working_dir: "/vllm-workspace"
 | 
				
			||||||
 | 
					  source_file_dependencies:
 | 
				
			||||||
 | 
					  - vllm/
 | 
				
			||||||
 | 
					  - .buildkite/scripts/run-prime-rl-test.sh
 | 
				
			||||||
 | 
					  commands:
 | 
				
			||||||
 | 
					    - bash .buildkite/scripts/run-prime-rl-test.sh
 | 
				
			||||||
 | 
				
			|||||||
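The steps above keep the pipeline's usual shape: `source_file_dependencies` gates a step on the paths a PR touches, and `commands` run inside the step's `working_dir`. As a rough, hedged sketch of that gating idea (this is not the actual Buildkite pipeline generator; the dependency prefixes are copied from the NixlConnector step above and the changed-file list is sample data):

```python
# Illustrative only: not vLLM's real pipeline generator.
# A step runs when any changed file falls under one of its
# source_file_dependencies prefixes.
step_deps = [
    "vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py",
    "tests/v1/kv_connector/nixl_integration/",
]

changed_files = [
    "vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py",
    "docs/README.md",  # made-up path, would not trigger the step on its own
]

should_run = any(
    changed.startswith(dep) for changed in changed_files for dep in step_deps
)
print("run 'NixlConnector PD accuracy tests':", should_run)  # True
```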
							
								
								
									
.coveragerc  (new file, 47 lines)
@@ -0,0 +1,47 @@
+[run]
+# Track the installed vllm package (this is what actually gets imported during tests)
+# Use wildcard pattern to match the installed location
+source =
+    vllm
+    */dist-packages/vllm
+    */site-packages/vllm
+omit =
+    */tests/*
+    */test_*
+    */__pycache__/*
+    */build/*
+    */dist/*
+    */vllm.egg-info/*
+    */third_party/*
+    */examples/*
+    */benchmarks/*
+    */docs/*
+
+[paths]
+# Map all possible vllm locations to a canonical "vllm" path
+# This ensures coverage.combine properly merges data from different test runs
+source =
+    vllm
+    /vllm-workspace/src/vllm
+    /vllm-workspace/vllm
+    */site-packages/vllm
+    */dist-packages/vllm
+
+[report]
+exclude_lines =
+    pragma: no cover
+    def __repr__
+    if self.debug:
+    if settings.DEBUG
+    raise AssertionError
+    raise NotImplementedError
+    if 0:
+    if __name__ == .__main__.:
+    class .*\bProtocol\):
+    @(abc\.)?abstractmethod
+
+[html]
+directory = htmlcov
+
+[xml]
+output = coverage.xml
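The diff itself does not show how this file is exercised, so here is a minimal sketch, assuming coverage.py is installed and `.coverage.*` data files from separate test runs sit in the working directory, of the combine-and-report flow that the `[paths]`, `[html]`, and `[xml]` sections are written for:

```python
# Minimal sketch: merge per-run coverage data using the .coveragerc above.
import coverage

cov = coverage.Coverage(config_file=".coveragerc")
cov.combine()      # [paths] remaps */site-packages/vllm etc. onto the canonical "vllm"
cov.save()
cov.xml_report()   # writes coverage.xml, per the [xml] section
cov.html_report()  # writes htmlcov/, per the [html] section
```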
							
								
								
									
.git-blame-ignore-revs  (new file, 4 lines)
@@ -0,0 +1,4 @@
+# Migrate from `yapf` & `isort` to `ruff`
+d6953beb91da4e9c99be4c0a1304a2d24189535c
+# Convert `Optional[x]` to `x | None` and `Union[x, y]` to `x | y`
+8fcaaf6a165e661f63fc51be906bc05b0767332f
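A `.git-blame-ignore-revs` file only takes effect when git is pointed at it. A hedged sketch of the two standard ways to do that (the target path is a placeholder, and the subprocess wrapper is just to keep the example in Python):

```python
# Sketch: make git blame skip the formatting-only commits listed above.
import subprocess

# One-off invocation:
subprocess.run(
    ["git", "blame", "--ignore-revs-file", ".git-blame-ignore-revs", "some/file.py"],
    check=True,
)

# Or persist it for the local clone, so plain `git blame` picks it up:
subprocess.run(
    ["git", "config", "blame.ignoreRevsFile", ".git-blame-ignore-revs"],
    check=True,
)
```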
							
								
								
									
.github/CODEOWNERS  (vendored, 65 changes)
@@ -2,72 +2,86 @@
 # for more info about CODEOWNERS file
 
 # This lists cover the "core" components of vLLM that require careful review
+/vllm/attention @LucasWilkinson
 /vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
-/vllm/core @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
-/vllm/engine/llm_engine.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
 /vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
-/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn
-/vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
-/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @NickLucche
-/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256
+/vllm/model_executor/layers/fused_moe @mgoin @pavanimajety
+/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 @pavanimajety
 /vllm/model_executor/layers/mamba @tdoublep
 /vllm/model_executor/model_loader @22quinn
 /vllm/multimodal @DarkLight1337 @ywang96 @NickLucche
-/vllm/v1/sample @22quinn @houseroad
 /vllm/vllm_flash_attn @LucasWilkinson
 /vllm/lora @jeejeelee
 /vllm/reasoning @aarnphm @chaunceyjiang
 /vllm/entrypoints @aarnphm @chaunceyjiang
 /vllm/compilation @zou3519 @youkaichao @ProExpertProg
-/vllm/distributed/kv_transfer @NickLucche
+/vllm/distributed/kv_transfer @NickLucche @ApostaC
 CMakeLists.txt @tlrmchlsmth @LucasWilkinson
 
 # Any change to the VllmConfig changes can have a large user-facing impact,
 # so spam a lot of people
 /vllm/config @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg
+/vllm/config/cache.py @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg @heheda12345
 
 # vLLM V1
-/vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
-/vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett
-/vllm/v1/spec_decode @benchislett @luccafong
+/vllm/v1/attention @LucasWilkinson
+/vllm/v1/attention/backends/mla @pavanimajety
+/vllm/v1/attention/backends/flashinfer.py @mgoin @pavanimajety
 /vllm/v1/attention/backends/triton_attn.py @tdoublep
-/vllm/v1/core @heheda12345
+/vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat @heheda12345 @ApostaC
+/vllm/v1/sample @22quinn @houseroad @njhill
+/vllm/v1/spec_decode @benchislett @luccafong
+/vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett
 /vllm/v1/kv_cache_interface.py @heheda12345
+/vllm/v1/offloading @ApostaC
 
 # Test ownership
 /.buildkite/lm-eval-harness @mgoin @simon-mo
-/tests/async_engine @njhill @robertgshaw2-redhat @simon-mo
 /tests/distributed/test_multi_node_assignment.py @youkaichao
 /tests/distributed/test_pipeline_parallel.py @youkaichao
 /tests/distributed/test_same_node.py @youkaichao
 /tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo @aarnphm @NickLucche
-/tests/kernels @tlrmchlsmth @WoosukKwon @yewentao256
+/tests/evals @mgoin
+/tests/kernels @mgoin @tlrmchlsmth @WoosukKwon @yewentao256
 /tests/models @DarkLight1337 @ywang96
 /tests/multimodal @DarkLight1337 @ywang96 @NickLucche
-/tests/prefix_caching @comaniac @KuntaiDu
-/tests/quantization @mgoin @robertgshaw2-redhat @yewentao256
+/tests/quantization @mgoin @robertgshaw2-redhat @yewentao256 @pavanimajety
 /tests/test_inputs.py @DarkLight1337 @ywang96
 /tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
 /tests/v1/structured_output @mgoin @russellb @aarnphm
-/tests/v1/core @heheda12345
+/tests/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat @heheda12345 @ApostaC
 /tests/weight_loading @mgoin @youkaichao @yewentao256
 /tests/lora @jeejeelee
 /tests/models/language/generation/test_hybrid.py @tdoublep
 /tests/v1/kv_connector/nixl_integration @NickLucche
+/tests/v1/kv_connector @ApostaC
+/tests/v1/offloading @ApostaC
+
+# Transformers backend
+/vllm/model_executor/models/transformers @hmellor
+/tests/models/test_transformers.py @hmellor
 
 # Docs
-/docs @hmellor
+/docs/mkdocs @hmellor
+/docs/**/*.yml @hmellor
+/requirements/docs.txt @hmellor
+.readthedocs.yaml @hmellor
 mkdocs.yaml @hmellor
+
+# Linting
+.markdownlint.yaml @hmellor
+.pre-commit-config.yaml @hmellor
+/tools/pre_commit @hmellor
 
 # CPU
-/vllm/v1/worker/^cpu @bigPYJ1151
+/vllm/v1/worker/cpu* @bigPYJ1151
 /csrc/cpu @bigPYJ1151
 /vllm/platforms/cpu.py @bigPYJ1151
 /cmake/cpu_extension.cmake @bigPYJ1151
 /docker/Dockerfile.cpu @bigPYJ1151
 
 # Intel GPU
-/vllm/v1/worker/^xpu @jikunshang
+/vllm/v1/worker/xpu* @jikunshang
 /vllm/platforms/xpu.py @jikunshang
 /docker/Dockerfile.xpu @jikunshang
 
@@ -102,3 +116,14 @@ mkdocs.yaml @hmellor
 /vllm/platforms/tpu.py @NickLucche
 /vllm/v1/sample/tpu @NickLucche
 /vllm/tests/v1/tpu @NickLucche
+
+# KVConnector installation files
+/requirements/kv_connectors.txt @NickLucche
+
+# Pooling models
+/examples/*/pooling/ @noooop
+/tests/models/*/pooling* @noooop
+/tests/entrypoints/pooling @noooop
+/vllm/config/pooler.py @noooop
+/vllm/pooling_params.py @noooop
+/vllm/model_executor/layers/pooler.py @noooop
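One detail worth calling out in this hunk: CODEOWNERS patterns are gitignore-style globs rather than regexes, which is why the old `^cpu` / `^xpu` entries matched nothing and the corrected `cpu*` / `xpu*` ones do. A small sketch, using `fnmatch` as an approximation of the glob semantics and with file names chosen only for illustration:

```python
# Sketch only: approximate CODEOWNERS glob matching with fnmatch.
from fnmatch import fnmatch

pattern = "vllm/v1/worker/cpu*"   # from the corrected "/vllm/v1/worker/cpu*" entry
for path in ["vllm/v1/worker/cpu_model_runner.py", "vllm/v1/worker/gpu_model_runner.py"]:
    owner = "@bigPYJ1151" if fnmatch(path, pattern) else "(no CPU owner)"
    print(f"{path} -> {owner}")
```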
							
								
								
									
.github/ISSUE_TEMPLATE/750-RFC.yml  (vendored, 4 changes)
@@ -43,10 +43,6 @@ body:
       Any other things you would like to mention.
   validations:
     required: false
-- type: markdown
-  attributes:
-    value: >
-      Thanks for contributing 🎉! The vLLM core team hosts a biweekly RFC review session at 9:30AM Pacific Time, while most RFCs can be discussed online, you can optionally sign up for a slot to discuss your RFC online [here](https://docs.google.com/document/d/1CiLVBZeIVfR7_PNAKVSusxpceywkoOOB78qoWqHvSZc/edit).
 - type: checkboxes
   id: askllm
   attributes:
							
								
								
									
.github/mergify.yml  (vendored, 53 changes)
@@ -2,6 +2,7 @@ pull_request_rules:
 - name: label-documentation
   description: Automatically apply documentation label
   conditions:
+    - label != stale
     - or:
       - files~=^[^/]+\.md$
       - files~=^docs/
@@ -10,10 +11,13 @@ pull_request_rules:
     label:
       add:
         - documentation
+    comment:
+      message: "Documentation preview: https://vllm--{{number}}.org.readthedocs.build/en/{{number}}/"
 
 - name: label-ci-build
   description: Automatically apply ci/build label
   conditions:
+    - label != stale
     - or:
       - files~=^\.github/
       - files~=\.buildkite/
@@ -30,6 +34,7 @@ pull_request_rules:
 - name: label-deepseek
   description: Automatically apply deepseek label
   conditions:
+    - label != stale
     - or:
       - files~=^examples/.*deepseek.*\.py
       - files~=^tests/.*deepseek.*\.py
@@ -46,6 +51,7 @@ pull_request_rules:
 - name: label-frontend
   description: Automatically apply frontend label
   conditions:
+    - label != stale
     - files~=^vllm/entrypoints/
   actions:
     label:
@@ -55,6 +61,7 @@ pull_request_rules:
 - name: label-llama
   description: Automatically apply llama label
   conditions:
+    - label != stale
     - or:
       - files~=^examples/.*llama.*\.py
       - files~=^tests/.*llama.*\.py
@@ -70,6 +77,7 @@ pull_request_rules:
 - name: label-multi-modality
   description: Automatically apply multi-modality label
   conditions:
+    - label != stale
     - or:
       - files~=^vllm/multimodal/
       - files~=^tests/multimodal/
@@ -83,6 +91,7 @@ pull_request_rules:
 - name: label-new-model
   description: Automatically apply new-model label
   conditions:
+    - label != stale
     - and:
       - files~=^vllm/model_executor/models/
       - files=vllm/model_executor/models/registry.py
@@ -94,6 +103,7 @@ pull_request_rules:
 - name: label-performance
   description: Automatically apply performance label
   conditions:
+    - label != stale
     - or:
       - files~=^benchmarks/
       - files~=^vllm/benchmarks/
@@ -107,6 +117,7 @@ pull_request_rules:
 - name: label-qwen
   description: Automatically apply qwen label
   conditions:
+    - label != stale
     - or:
       - files~=^examples/.*qwen.*\.py
       - files~=^tests/.*qwen.*\.py
@@ -121,12 +132,20 @@ pull_request_rules:
 - name: label-gpt-oss
   description: Automatically apply gpt-oss label
   conditions:
+    - label != stale
     - or:
       - files~=^examples/.*gpt[-_]?oss.*\.py
       - files~=^tests/.*gpt[-_]?oss.*\.py
+      - files~=^tests/entrypoints/openai/test_response_api_with_harmony.py
+      - files~=^tests/entrypoints/test_context.py
       - files~=^vllm/model_executor/models/.*gpt[-_]?oss.*\.py
       - files~=^vllm/model_executor/layers/.*gpt[-_]?oss.*\.py
+      - files~=^vllm/entrypoints/harmony_utils.py
+      - files~=^vllm/entrypoints/tool_server.py
+      - files~=^vllm/entrypoints/tool.py
+      - files~=^vllm/entrypoints/context.py
       - title~=(?i)gpt[-_]?oss
+      - title~=(?i)harmony
   actions:
     label:
       add:
@@ -135,6 +154,7 @@ pull_request_rules:
 - name: label-rocm
   description: Automatically apply rocm label
   conditions:
+    - label != stale
     - or:
       - files~=^csrc/rocm/
       - files~=^docker/Dockerfile.rocm
@@ -155,6 +175,7 @@ pull_request_rules:
 - name: label-structured-output
   description: Automatically apply structured-output label
   conditions:
+    - label != stale
     - or:
       - files~=^benchmarks/structured_schemas/
       - files=benchmarks/benchmark_serving_structured_output.py
@@ -164,7 +185,7 @@ pull_request_rules:
       - files=examples/online_serving/openai_chat_completion_structured_outputs.py
       - files=examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
       - files~=^tests/v1/structured_output/
-      - files=tests/v1/entrypoints/llm/test_guided_generate.py
+      - files=tests/v1/entrypoints/llm/test_struct_output_generate.py
       - files~=^vllm/v1/structured_output/
   actions:
     label:
@@ -174,6 +195,7 @@ pull_request_rules:
 - name: label-speculative-decoding
   description: Automatically apply speculative-decoding label
   conditions:
+    - label != stale
     - or:
       - files~=^vllm/v1/spec_decode/
       - files~=^tests/v1/spec_decode/
@@ -189,6 +211,7 @@ pull_request_rules:
 - name: label-v1
   description: Automatically apply v1 label
   conditions:
+    - label != stale
     - or:
       - files~=^vllm/v1/
       - files~=^tests/v1/
@@ -201,6 +224,7 @@ pull_request_rules:
   description: Automatically apply tpu label
   # Keep this list in sync with `label-tpu-remove` conditions
   conditions:
+    - label != stale
     - or:
       - files~=tpu.py
       - files~=_tpu
@@ -216,6 +240,7 @@ pull_request_rules:
   description: Automatically remove tpu label
   # Keep this list in sync with `label-tpu` conditions
   conditions:
+    - label != stale
     - and:
       - -files~=tpu.py
       - -files~=_tpu
@@ -230,9 +255,9 @@ pull_request_rules:
 - name: label-tool-calling
   description: Automatically add tool-calling label
   conditions:
+    - label != stale
     - or:
       - files~=^tests/tool_use/
-      - files~=^tests/mistral_tool_use/
      - files~=^tests/entrypoints/openai/tool_parsers/
       - files=tests/entrypoints/openai/test_chat_with_tool_reasoning.py
       - files~=^vllm/entrypoints/openai/tool_parsers/
@@ -249,6 +274,7 @@ pull_request_rules:
 
 - name: ping author on conflicts and add 'needs-rebase' label
   conditions:
+    - label != stale
     - conflict
     - -closed
   actions:
@@ -264,10 +290,12 @@ pull_request_rules:
 
 - name: assign reviewer for tensorizer changes
   conditions:
+    - label != stale
+    - or:
       - files~=^vllm/model_executor/model_loader/tensorizer.py
       - files~=^vllm/model_executor/model_loader/tensorizer_loader.py
       - files~=^tests/entrypoints/openai/test_tensorizer_entrypoint.py
-      - files~=^tests/tensorizer_loader/
+      - files~=^tests/model_executor/model_loader/tensorizer_loader/
   actions:
     assign:
       users:
@@ -275,6 +303,7 @@ pull_request_rules:
 
 - name: assign reviewer for modelopt changes
   conditions:
+    - label != stale
     - or:
         - files~=^vllm/model_executor/layers/quantization/modelopt\.py$
         - files~=^vllm/model_executor/layers/quantization/__init__\.py$
@@ -295,3 +324,21 @@ pull_request_rules:
     label:
       remove:
         - needs-rebase
+
+- name: label-kv-connector
+  description: Automatically apply kv-connector label
+  conditions:
+    - label != stale
+    - or:
+      - files~=^examples/online_serving/disaggregated[^/]*/.*
+      - files~=^examples/offline_inference/disaggregated[^/]*/.*
+      - files~=^examples/others/lmcache/
+      - files~=^tests/v1/kv_connector/
+      - files~=^vllm/distributed/kv_transfer/
+      - title~=(?i)\bP/?D\b
+      - title~=(?i)NIXL
+      - title~=(?i)LMCache
+  actions:
+    label:
+      add:
+        - kv-connector
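The `files~=` and `title~=` conditions in these rules are regular expressions evaluated against a PR's changed paths and title. A hedged sketch of how the new kv-connector rule's path conditions behave (the changed-file list is invented for the example, and Mergify's real evaluation engine is of course more involved than a bare regex loop):

```python
# Sketch: the rule's files~= conditions as plain regex searches over changed paths.
import re

file_conditions = [
    r"^examples/others/lmcache/",
    r"^tests/v1/kv_connector/",
    r"^vllm/distributed/kv_transfer/",
]

changed_files = [
    "vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py",
    "docs/design/somefile.md",  # made-up path, matches nothing
]

if any(re.search(p, f) for f in changed_files for p in file_conditions):
    print("mergify would add label: kv-connector")
```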
							
								
								
									
.github/workflows/bc-lint.yml  (vendored, 2 changes)
@@ -6,6 +6,8 @@ on:
       - opened
       - synchronize
       - reopened
+      - labeled
+      - unlabeled
 
 jobs:
   bc_lint:
							
								
								
									
.github/workflows/issue_autolabel.yml  (vendored, 130 changes)
@@ -13,6 +13,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Label issues based on keywords
+        id: label-step
         uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd  # v8.0.0
         with:
           script: |
@@ -42,7 +43,6 @@ jobs:
                     searchIn: "body"
                   },
                 ],
-
                 // Substring search - matches anywhere in text (partial matches)
                 substrings: [
                   {
@@ -89,14 +89,12 @@ jobs:
                     term: "hip_",
                     searchIn: "both"
                   },
-
                   // ROCm tools and libraries
                   {
                     term: "hipify",
                     searchIn: "both"
                   },
                 ],
-
                 // Regex patterns - for complex pattern matching
                 regexPatterns: [
                   {
@@ -107,13 +105,17 @@ jobs:
                   }
                 ],
               },
+              // Add more label configurations here as needed
+              // example: {
+              //   keywords: [...],
+              //   substrings: [...],
+              //   regexPatterns: [...]
+              // },
             };
-
             // Helper function to create regex based on search type
             function createSearchRegex(term, type) {
               // Escape special regex characters in the term
               const escapedTerm = term.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
-
               switch (type) {
                 case 'keyword':
                   // Word boundary search - matches whole words only
@@ -125,16 +127,13 @@ jobs:
                   throw new Error(`Unknown search type: ${type}`);
               }
             }
-
             // Helper function to find matching terms in text with line information
             function findMatchingTermsWithLines(text, searchTerms = [], searchType = 'keyword', searchLocation = '') {
               const matches = [];
               const lines = text.split('\n');
-
               for (const termConfig of searchTerms) {
                 let regex;
                 let term, searchIn, pattern, description, flags;
-
                 // Handle different input formats (string or object)
                 if (typeof termConfig === 'string') {
                   term = termConfig;
@@ -146,21 +145,17 @@ jobs:
                   description = termConfig.description;
                   flags = termConfig.flags;
                 }
-
                 // Skip if this term shouldn't be searched in the current location
                 if (searchIn !== 'both' && searchIn !== searchLocation) {
                   continue;
                 }
-
                 // Create appropriate regex
                 if (searchType === 'regex') {
                   regex = new RegExp(pattern, flags || "gi");
                 } else {
                   regex = createSearchRegex(term, searchType);
                 }
-
                 const termMatches = [];
-
                 // Check each line for matches
                 lines.forEach((line, lineIndex) => {
                   const lineMatches = line.match(regex);
@@ -183,7 +178,6 @@ jobs:
                     });
                   }
                 });
-
                 if (termMatches.length > 0) {
                   matches.push({
                     term: term || (description || pattern),
@@ -196,64 +190,48 @@ jobs:
                   });
                 }
               }
-
               return matches;
             }
-
             // Helper function to check if label should be added
             async function processLabel(labelName, config) {
               const body = context.payload.issue.body || "";
               const title = context.payload.issue.title || "";
-
               core.notice(`Processing label: ${labelName}`);
               core.notice(`Issue Title: "${title}"`);
               core.notice(`Issue Body length: ${body.length} characters`);
-
               let shouldAddLabel = false;
               let allMatches = [];
               let reason = '';
-
               const keywords = config.keywords || [];
               const substrings = config.substrings || [];
               const regexPatterns = config.regexPatterns || [];
-
               core.notice(`Searching with ${keywords.length} keywords, ${substrings.length} substrings, and ${regexPatterns.length} regex patterns`);
-
               // Search in title
               if (title.trim()) {
                 core.notice(`Searching in title: "${title}"`);
-
                 const titleKeywordMatches = findMatchingTermsWithLines(title, keywords, 'keyword', 'title');
                 const titleSubstringMatches = findMatchingTermsWithLines(title, substrings, 'substring', 'title');
                 const titleRegexMatches = findMatchingTermsWithLines(title, regexPatterns, 'regex', 'title');
-
                 allMatches.push(...titleKeywordMatches, ...titleSubstringMatches, ...titleRegexMatches);
               }
-
               // Search in body
               if (body.trim()) {
                 core.notice(`Searching in body (${body.length} characters)`);
-
                 const bodyKeywordMatches = findMatchingTermsWithLines(body, keywords, 'keyword', 'body');
                 const bodySubstringMatches = findMatchingTermsWithLines(body, substrings, 'substring', 'body');
                 const bodyRegexMatches = findMatchingTermsWithLines(body, regexPatterns, 'regex', 'body');
-
                 allMatches.push(...bodyKeywordMatches, ...bodySubstringMatches, ...bodyRegexMatches);
               }
-
               if (allMatches.length > 0) {
                 core.notice(`Found ${allMatches.length} matching term(s):`);
-
                 for (const termMatch of allMatches) {
                   const locationText = termMatch.searchLocation === 'title' ? 'title' : 'body';
                   const searchInText = termMatch.searchIn === 'both' ? 'both' : termMatch.searchIn;
-
                   if (termMatch.searchType === 'regex') {
                     core.notice(`  📍 Regex: "${termMatch.term}" (pattern: ${termMatch.pattern}) found ${termMatch.count} time(s) in ${locationText} (configured to search in: ${searchInText}):`);
                   } else {
                     core.notice(`  📍 Term: "${termMatch.term}" (${termMatch.searchType} search) found ${termMatch.count} time(s) in ${locationText} (configured to search in: ${searchInText}):`);
                   }
-
                   // Show details for each match
                   termMatch.matches.forEach((match, index) => {
 | 
					                  termMatch.matches.forEach((match, index) => {
 | 
				
			||||||
                    core.notice(`    ${index + 1}. Line ${match.lineNumber} in ${match.searchLocation}: "${match.match}" [${match.searchType}]`);
 | 
					                    core.notice(`    ${index + 1}. Line ${match.lineNumber} in ${match.searchLocation}: "${match.match}" [${match.searchType}]`);
 | 
				
			||||||
@ -266,7 +244,6 @@ jobs:
 | 
				
			|||||||
                    }
 | 
					                    }
 | 
				
			||||||
                  });
 | 
					                  });
 | 
				
			||||||
                }
 | 
					                }
 | 
				
			||||||
                
 | 
					 | 
				
			||||||
                shouldAddLabel = true;
 | 
					                shouldAddLabel = true;
 | 
				
			||||||
                const totalMatches = allMatches.reduce((sum, t) => sum + t.count, 0);
 | 
					                const totalMatches = allMatches.reduce((sum, t) => sum + t.count, 0);
 | 
				
			||||||
                const titleMatches = allMatches.filter(t => t.searchLocation === 'title').reduce((sum, t) => sum + t.count, 0);
 | 
					                const titleMatches = allMatches.filter(t => t.searchLocation === 'title').reduce((sum, t) => sum + t.count, 0);
 | 
				
			||||||
@ -274,13 +251,10 @@ jobs:
 | 
				
			|||||||
                const keywordMatches = allMatches.filter(t => t.searchType === 'keyword').reduce((sum, t) => sum + t.count, 0);
 | 
					                const keywordMatches = allMatches.filter(t => t.searchType === 'keyword').reduce((sum, t) => sum + t.count, 0);
 | 
				
			||||||
                const substringMatches = allMatches.filter(t => t.searchType === 'substring').reduce((sum, t) => sum + t.count, 0);
 | 
					                const substringMatches = allMatches.filter(t => t.searchType === 'substring').reduce((sum, t) => sum + t.count, 0);
 | 
				
			||||||
                const regexMatches = allMatches.filter(t => t.searchType === 'regex').reduce((sum, t) => sum + t.count, 0);
 | 
					                const regexMatches = allMatches.filter(t => t.searchType === 'regex').reduce((sum, t) => sum + t.count, 0);
 | 
				
			||||||
                
 | 
					 | 
				
			||||||
                reason = `Found ${totalMatches} total matches (${titleMatches} in title, ${bodyMatches} in body) - ${keywordMatches} keyword matches, ${substringMatches} substring matches, ${regexMatches} regex matches`;
 | 
					                reason = `Found ${totalMatches} total matches (${titleMatches} in title, ${bodyMatches} in body) - ${keywordMatches} keyword matches, ${substringMatches} substring matches, ${regexMatches} regex matches`;
 | 
				
			||||||
              }
 | 
					              }
 | 
				
			||||||
              
 | 
					 | 
				
			||||||
              core.notice(`Final decision: ${shouldAddLabel ? 'ADD LABEL' : 'DO NOT ADD LABEL'}`);
 | 
					              core.notice(`Final decision: ${shouldAddLabel ? 'ADD LABEL' : 'DO NOT ADD LABEL'}`);
 | 
				
			||||||
              core.notice(`Reason: ${reason || 'No matching terms found'}`);
 | 
					              core.notice(`Reason: ${reason || 'No matching terms found'}`);
 | 
				
			||||||
              
 | 
					 | 
				
			||||||
              if (shouldAddLabel) {
 | 
					              if (shouldAddLabel) {
 | 
				
			||||||
                const existingLabels = context.payload.issue.labels.map(l => l.name);
 | 
					                const existingLabels = context.payload.issue.labels.map(l => l.name);
 | 
				
			||||||
                if (!existingLabels.includes(labelName)) {
 | 
					                if (!existingLabels.includes(labelName)) {
 | 
				
			||||||
@ -296,14 +270,92 @@ jobs:
 | 
				
			|||||||
                core.notice(`Label "${labelName}" already present.`);
 | 
					                core.notice(`Label "${labelName}" already present.`);
 | 
				
			||||||
                return false;
 | 
					                return false;
 | 
				
			||||||
              }
 | 
					              }
 | 
				
			||||||
              
 | 
					 | 
				
			||||||
              core.notice(`No matching terms found for label "${labelName}".`);
 | 
					              core.notice(`No matching terms found for label "${labelName}".`);
 | 
				
			||||||
              return false;
 | 
					              return false;
 | 
				
			||||||
            }
 | 
					            }
 | 
				
			||||||
            
 | 
					 | 
				
			||||||
            // Process all configured labels
 | 
					            // Process all configured labels
 | 
				
			||||||
            const processLabels = Object.entries(labelConfig)
 | 
					            const labelsAddedResults = await Promise.all(
 | 
				
			||||||
              .map(([labelName, config]) => processLabel(labelName, config));
 | 
					              Object.entries(labelConfig).map(([labelName, config]) => 
 | 
				
			||||||
            const labelsAdded = await Promise.all(processLabels);
 | 
					                processLabel(labelName, config).then(added => ({ labelName, added }))
 | 
				
			||||||
            const numLabelsAdded = labelsAdded.reduce((x, y) => x + y, 0);
 | 
					              )
 | 
				
			||||||
 | 
					            );
 | 
				
			||||||
 | 
					            
 | 
				
			||||||
 | 
					            const numLabelsAdded = labelsAddedResults.filter(r => r.added).length;
 | 
				
			||||||
            core.notice(`Processing complete. ${numLabelsAdded} label(s) added.`);
 | 
					            core.notice(`Processing complete. ${numLabelsAdded} label(s) added.`);
 | 
				
			||||||
 | 
					            
 | 
				
			||||||
 | 
					            // Return which labels were added for the next step
 | 
				
			||||||
 | 
					            const addedLabels = labelsAddedResults.filter(r => r.added).map(r => r.labelName);
 | 
				
			||||||
 | 
					            core.setOutput('labels_added', JSON.stringify(addedLabels));
 | 
				
			||||||
 | 
					            return addedLabels;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      - name: CC users for labeled issues
 | 
				
			||||||
 | 
					        if: steps.label-step.outputs.labels_added != '[]'
 | 
				
			||||||
 | 
					        uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd  # v8.0.0
 | 
				
			||||||
 | 
					        with:
 | 
				
			||||||
 | 
					          script: |
 | 
				
			||||||
 | 
					            // Configuration: Map labels to GitHub users to CC
 | 
				
			||||||
 | 
					            // You can add multiple users per label, and multiple label configurations
 | 
				
			||||||
 | 
					            const ccConfig = {
 | 
				
			||||||
 | 
					              rocm: {
 | 
				
			||||||
 | 
					                users: ['hongxiayang', 'tjtanaa', 'vllmellm'],  // Add more users as needed: ['user1', 'user2', 'user3']
 | 
				
			||||||
 | 
					                message: 'CC {users} for ROCm-related issue'  // {users} will be replaced with @mentions
 | 
				
			||||||
 | 
					              },
 | 
				
			||||||
 | 
					              // Add more label -> user mappings here
 | 
				
			||||||
 | 
					              // Example:
 | 
				
			||||||
 | 
					              // cuda: {
 | 
				
			||||||
 | 
					              //   users: ['user1', 'user2'],
 | 
				
			||||||
 | 
					              //   message: 'CC {users} for CUDA-related issue'
 | 
				
			||||||
 | 
					              // },
 | 
				
			||||||
 | 
					              // performance: {
 | 
				
			||||||
 | 
					              //   users: ['perfexpert'],
 | 
				
			||||||
 | 
					              //   message: 'CC {users} for performance issue'
 | 
				
			||||||
 | 
					              // },
 | 
				
			||||||
 | 
					            };
 | 
				
			||||||
 | 
					            
 | 
				
			||||||
 | 
					            const labelsAdded = JSON.parse('${{ steps.label-step.outputs.labels_added }}');
 | 
				
			||||||
 | 
					            core.notice(`Labels added: ${labelsAdded.join(', ')}`);
 | 
				
			||||||
 | 
					            
 | 
				
			||||||
 | 
					            // Get existing comments to check for already mentioned users
 | 
				
			||||||
 | 
					            const comments = await github.rest.issues.listComments({
 | 
				
			||||||
 | 
					              owner: context.repo.owner,
 | 
				
			||||||
 | 
					              repo: context.repo.repo,
 | 
				
			||||||
 | 
					              issue_number: context.issue.number,
 | 
				
			||||||
 | 
					            });
 | 
				
			||||||
 | 
					            
 | 
				
			||||||
 | 
					            const issueBody = context.payload.issue.body || '';
 | 
				
			||||||
 | 
					            const allExistingText = issueBody + '\n' + comments.data.map(c => c.body).join('\n');
 | 
				
			||||||
 | 
					            
 | 
				
			||||||
 | 
					            // Process each label that was added
 | 
				
			||||||
 | 
					            for (const label of labelsAdded) {
 | 
				
			||||||
 | 
					              if (ccConfig[label]) {
 | 
				
			||||||
 | 
					                const config = ccConfig[label];
 | 
				
			||||||
 | 
					                const usersToMention = [];
 | 
				
			||||||
 | 
					                
 | 
				
			||||||
 | 
					                // Check which users haven't been mentioned yet
 | 
				
			||||||
 | 
					                for (const user of config.users) {
 | 
				
			||||||
 | 
					                  const mentionPattern = new RegExp(`@${user}\\b`, 'i');
 | 
				
			||||||
 | 
					                  if (!mentionPattern.test(allExistingText)) {
 | 
				
			||||||
 | 
					                    usersToMention.push(user);
 | 
				
			||||||
 | 
					                  } else {
 | 
				
			||||||
 | 
					                    core.notice(`@${user} already mentioned for label "${label}", skipping`);
 | 
				
			||||||
 | 
					                  }
 | 
				
			||||||
 | 
					                }
 | 
				
			||||||
 | 
					                
 | 
				
			||||||
 | 
					                // Post comment if there are users to mention
 | 
				
			||||||
 | 
					                if (usersToMention.length > 0) {
 | 
				
			||||||
 | 
					                  const mentions = usersToMention.map(u => `@${u}`).join(' ');
 | 
				
			||||||
 | 
					                  const message = config.message.replace('{users}', mentions);
 | 
				
			||||||
 | 
					                  
 | 
				
			||||||
 | 
					                  await github.rest.issues.createComment({
 | 
				
			||||||
 | 
					                    owner: context.repo.owner,
 | 
				
			||||||
 | 
					                    repo: context.repo.repo,
 | 
				
			||||||
 | 
					                    issue_number: context.issue.number,
 | 
				
			||||||
 | 
					                    body: message
 | 
				
			||||||
 | 
					                  });
 | 
				
			||||||
 | 
					                  
 | 
				
			||||||
 | 
					                  core.notice(`CC comment added for label "${label}": ${mentions}`);
 | 
				
			||||||
 | 
					                } else {
 | 
				
			||||||
 | 
					                  core.notice(`All users for label "${label}" already mentioned, skipping comment`);
 | 
				
			||||||
 | 
					                }
 | 
				
			||||||
 | 
					              }
 | 
				
			||||||
 | 
					            }
 | 
				
			||||||
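Note: the "CC users for labeled issues" step added above reads steps.label-step.outputs.labels_added, so the labeling script is assumed to run in a step with id: label-step that exports its result via core.setOutput. Below is a minimal sketch of that wiring; the job name, step names, runner, and the unpinned @v8 tag are illustrative placeholders rather than part of this diff. Only the step id and the labels_added output are load-bearing.

jobs:
  auto-label:
    runs-on: ubuntu-latest
    steps:
      - name: Label issues based on keywords
        id: label-step                  # referenced by the CC step's `if:` condition
        uses: actions/github-script@v8  # illustrative tag; the workflow pins a SHA
        with:
          script: |
            // ... labeling script from the diff above; it finishes with
            // core.setOutput('labels_added', JSON.stringify(addedLabels));
      - name: CC users for labeled issues
        if: steps.label-step.outputs.labels_added != '[]'
        uses: actions/github-script@v8  # illustrative tag
        with:
          script: |
            // ... CC script from the diff above
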
							
								
								
									
.github/workflows/stale.yml (2 changes, vendored)

@@ -13,7 +13,7 @@ jobs:
       actions: write
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/stale@3a9db7e6a41a89f618792c92c0e97cc736e1b13f # v10.0.0
+      - uses: actions/stale@5f858e3efba33a5ca4407a664cc011ad407f2008 # v10.1.0
         with:
           # Increasing this value ensures that changes to this workflow
           # propagate to all issues and PRs in days rather than months
							
								
								
									
.gitignore (3 changes, vendored)

@@ -94,6 +94,9 @@ ipython_config.py
 # generated files
 **/generated/**

+# uv
+uv.lock
+
 # pyenv
 #   For a library or package, you might want to ignore these files since the code is
 #   intended to run in multiple environments; otherwise, check them in:
@@ -4,7 +4,6 @@ MD013: false
 MD024:
   siblings_only: true
 MD033: false
-MD042: false
 MD045: false
 MD046: false
 MD051: false
@@ -6,30 +6,19 @@ default_stages:
   - manual # Run in CI
 exclude: 'vllm/third_party/.*'
 repos:
-- repo: https://github.com/google/yapf
-  rev: v0.43.0
-  hooks:
-  - id: yapf
-    args: [--in-place, --verbose]
-    # Keep the same list from yapfignore here to avoid yapf failing without any inputs
-    exclude: '(.buildkite|benchmarks|build|examples)/.*'
 - repo: https://github.com/astral-sh/ruff-pre-commit
-  rev: v0.11.7
+  rev: v0.14.0
   hooks:
-  - id: ruff
+  - id: ruff-check
     args: [--output-format, github, --fix]
   - id: ruff-format
-    files: ^(.buildkite|benchmarks|examples)/.*
 - repo: https://github.com/crate-ci/typos
-  rev: v1.35.5
+  rev: v1.38.1
   hooks:
   - id: typos
+    args: [--force-exclude]
-- repo: https://github.com/PyCQA/isort
-  rev: 6.0.1
-  hooks:
-  - id: isort
 - repo: https://github.com/pre-commit/mirrors-clang-format
-  rev: v20.1.3
+  rev: v21.1.2
   hooks:
   - id: clang-format
     exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))|vllm/third_party/.*'
@@ -46,10 +35,10 @@ repos:
   hooks:
   - id: actionlint
 - repo: https://github.com/astral-sh/uv-pre-commit
-  rev: 0.6.17
+  rev: 0.9.1
   hooks:
     - id: pip-compile
-      args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu128]
+      args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu129, --python-platform, x86_64-manylinux_2_28]
       files: ^requirements/test\.(in|txt)$
 - repo: local
   hooks:
@@ -60,38 +49,32 @@ repos:
     files: ^requirements/test\.(in|txt)$
   - id: mypy-local
     name: Run mypy for local Python installation
-    entry: tools/mypy.sh 0 "local"
-    language: python
-    types: [python]
-    additional_dependencies: &mypy_deps [mypy==1.11.1, types-cachetools, types-setuptools, types-PyYAML, types-requests, pydantic]
+    entry: python tools/pre_commit/mypy.py 0 "local"
     stages: [pre-commit] # Don't run in CI
-  - id: mypy-3.9 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
-    name: Run mypy for Python 3.9
-    entry: tools/mypy.sh 1 "3.9"
-    language: python
-    types: [python]
-    additional_dependencies: *mypy_deps
-    stages: [manual] # Only run in CI
+    <<: &mypy_common
+      language: python
+      types_or: [python, pyi]
+      require_serial: true
+      additional_dependencies: [mypy==1.11.1, regex, types-cachetools, types-setuptools, types-PyYAML, types-requests, types-torch, pydantic]
   - id: mypy-3.10 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
     name: Run mypy for Python 3.10
-    entry: tools/mypy.sh 1 "3.10"
-    language: python
-    types: [python]
-    additional_dependencies: *mypy_deps
+    entry: python tools/pre_commit/mypy.py 1 "3.10"
+    <<: *mypy_common
     stages: [manual] # Only run in CI
   - id: mypy-3.11 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
     name: Run mypy for Python 3.11
-    entry: tools/mypy.sh 1 "3.11"
-    language: python
-    types: [python]
-    additional_dependencies: *mypy_deps
+    entry: python tools/pre_commit/mypy.py 1 "3.11"
+    <<: *mypy_common
     stages: [manual] # Only run in CI
   - id: mypy-3.12 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
     name: Run mypy for Python 3.12
-    entry: tools/mypy.sh 1 "3.12"
-    language: python
-    types: [python]
-    additional_dependencies: *mypy_deps
+    entry: python tools/pre_commit/mypy.py 1 "3.12"
+    <<: *mypy_common
+    stages: [manual] # Only run in CI
+  - id: mypy-3.13 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
+    name: Run mypy for Python 3.13
+    entry: python tools/pre_commit/mypy.py 1 "3.13"
+    <<: *mypy_common
     stages: [manual] # Only run in CI
   - id: shellcheck
     name: Lint shell scripts
@@ -155,18 +138,15 @@ repos:
     additional_dependencies: [regex]
   - id: check-pickle-imports
     name: Prevent new pickle/cloudpickle imports
-    entry: python tools/check_pickle_imports.py
+    entry: python tools/pre_commit/check_pickle_imports.py
     language: python
     types: [python]
-    pass_filenames: false
-    additional_dependencies: [pathspec, regex]
+    additional_dependencies: [regex]
   - id: validate-config
     name: Validate configuration has default values and that each field has a docstring
     entry: python tools/validate_config.py
     language: python
-    types: [python]
-    pass_filenames: true
-    files: vllm/config.py|tests/test_config.py|vllm/entrypoints/openai/cli_args.py
+    additional_dependencies: [regex]
   # Keep `suggestion` last
   - id: suggestion
     name: Suggestion
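Note: the rewritten mypy hooks above deduplicate their shared settings with a YAML anchor (&mypy_common) and merge keys (<<:). A toy, self-contained illustration of that pattern follows; the top-level key names "first" and "second" are invented for the example, while the nested keys mirror the ones the real hooks share.

first:
  name: first hook
  <<: &common            # the anchor captures this nested mapping...
    language: python
    require_serial: true
second:
  name: second hook
  <<: *common            # ...and the merge key copies the same settings here
# After loading, both "first" and "second" contain language and require_serial.
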
@@ -13,6 +13,7 @@ build:

 mkdocs:
   configuration: mkdocs.yaml
+  fail_on_warning: true

 # Optionally declare the Python requirements required to build your docs
 python:
							
								
								
									
CMakeLists.txt (139 changes)

@@ -13,6 +13,10 @@ cmake_minimum_required(VERSION 3.26)
 # cmake --install . --component _C
 project(vllm_extensions LANGUAGES CXX)

+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
+
 # CUDA by default, can be overridden by using -DVLLM_TARGET_DEVICE=... (used by setup.py)
 set(VLLM_TARGET_DEVICE "cuda" CACHE STRING "Target device backend for vLLM")
 message(STATUS "Build type: ${CMAKE_BUILD_TYPE}")
@@ -30,10 +34,10 @@ install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
 # Supported python versions.  These versions will be searched in order, the
 # first match will be selected.  These should be kept in sync with setup.py.
 #
-set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12" "3.13")
+set(PYTHON_SUPPORTED_VERSIONS "3.10" "3.11" "3.12" "3.13")

 # Supported AMD GPU architectures.
-set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201")
+set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151")

 #
 # Supported/expected torch versions for CUDA/ROCm.
@@ -45,8 +49,8 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1
 # requirements.txt files and should be kept consistent.  The ROCm torch
 # versions are derived from docker/Dockerfile.rocm
 #
-set(TORCH_SUPPORTED_VERSION_CUDA "2.8.0")
-set(TORCH_SUPPORTED_VERSION_ROCM "2.8.0")
+set(TORCH_SUPPORTED_VERSION_CUDA "2.9.0")
+set(TORCH_SUPPORTED_VERSION_ROCM "2.9.0")

 #
 # Try to find python package with an executable that exactly matches
@@ -82,6 +86,9 @@ find_package(Torch REQUIRED)
 # Supported NVIDIA architectures.
 # This check must happen after find_package(Torch) because that's when CMAKE_CUDA_COMPILER_VERSION gets defined
 if(DEFINED CMAKE_CUDA_COMPILER_VERSION AND
+   CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0)
+  set(CUDA_SUPPORTED_ARCHS "7.5;8.0;8.6;8.7;8.9;9.0;10.0;11.0;12.0")
+elseif(DEFINED CMAKE_CUDA_COMPILER_VERSION AND
    CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8)
   set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0")
 else()
@@ -171,6 +178,25 @@ if(NVCC_THREADS AND VLLM_GPU_LANG STREQUAL "CUDA")
   list(APPEND VLLM_GPU_FLAGS "--threads=${NVCC_THREADS}")
 endif()

+#
+# Set compression mode for CUDA >=13.x.
+#
+if(VLLM_GPU_LANG STREQUAL "CUDA" AND
+   DEFINED CMAKE_CUDA_COMPILER_VERSION AND
+   CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0)
+  list(APPEND VLLM_GPU_FLAGS "--compress-mode=size")
+endif()
+
+#
+# Set CUDA include flags for CXX compiler.
+#
+if(VLLM_GPU_LANG STREQUAL "CUDA")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -I${CUDA_TOOLKIT_ROOT_DIR}/include")
+  if(CUDA_VERSION VERSION_GREATER_EQUAL 13.0)
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -I${CUDA_TOOLKIT_ROOT_DIR}/include/cccl")
+  endif()
+endif()
+
 #
 # Use FetchContent for C++ dependencies that are compiled as part of vLLM's build process.
 # setup.py will override FETCHCONTENT_BASE_DIR to play nicely with sccache.
@@ -243,8 +269,8 @@ set(VLLM_EXT_SRC
   "csrc/sampler.cu"
   "csrc/cuda_view.cu"
   "csrc/quantization/gptq/q_gemm.cu"
-  "csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
-  "csrc/quantization/fp8/common.cu"
+  "csrc/quantization/w8a8/int8/scaled_quant.cu"
+  "csrc/quantization/w8a8/fp8/common.cu"
   "csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu"
   "csrc/quantization/gguf/gguf_kernel.cu"
   "csrc/quantization/activation_kernels.cu"
@@ -256,7 +282,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")

   # Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages when building.
-  set(CUTLASS_REVISION "v4.0.0" CACHE STRING "CUTLASS revision to use")
+  set(CUTLASS_REVISION "v4.2.1" CACHE STRING "CUTLASS revision to use")

   # Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
   if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR})
@@ -288,14 +314,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   list(APPEND VLLM_EXT_SRC
     "csrc/quantization/awq/gemm_kernels.cu"
     "csrc/permute_cols.cu"
-    "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
+    "csrc/quantization/w8a8/cutlass/scaled_mm_entry.cu"
     "csrc/quantization/fp4/nvfp4_quant_entry.cu"
     "csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu"
-    "csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu"
     "csrc/sparse/cutlass/sparse_scaled_mm_entry.cu"
     "csrc/cutlass_extensions/common.cpp"
-    "csrc/attention/mla/cutlass_mla_entry.cu"
-    "csrc/quantization/fp8/per_token_group_quant.cu")
+    "csrc/quantization/w8a8/fp8/per_token_group_quant.cu"
+    "csrc/quantization/w8a8/int8/per_token_group_quant.cu")

   set_gencode_flags_for_srcs(
     SRCS "${VLLM_EXT_SRC}"
@@ -399,11 +424,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}")
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND SCALED_MM_ARCHS)
     set(SRCS
-       "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm90.cu"
-       "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu"
-       "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_int8.cu"
-       "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_azp_sm90_int8.cu"
-       "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8.cu")
+       "csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm90.cu"
+       "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm90_fp8.cu"
+       "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm90_int8.cu"
+       "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_azp_sm90_int8.cu"
+       "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm90_fp8.cu")
     set_gencode_flags_for_srcs(
       SRCS "${SRCS}"
       CUDA_ARCHS "${SCALED_MM_ARCHS}")
@@ -427,12 +452,16 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")

   # The cutlass_scaled_mm kernels for Geforce Blackwell SM120 (c3x, i.e. CUTLASS 3.x) require
   # CUDA 12.8 or later
-  cuda_archs_loose_intersection(SCALED_MM_ARCHS "12.0;12.0a" "${CUDA_ARCHS}")
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
+    cuda_archs_loose_intersection(SCALED_MM_ARCHS "12.0f" "${CUDA_ARCHS}")
+  else()
+    cuda_archs_loose_intersection(SCALED_MM_ARCHS "12.0a" "${CUDA_ARCHS}")
+  endif()
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
     set(SRCS
-      "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm120.cu"
-      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm120_fp8.cu"
-      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm120_fp8.cu"
+      "csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm120.cu"
+      "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm120_fp8.cu"
+      "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm120_fp8.cu"
     )
     set_gencode_flags_for_srcs(
       SRCS "${SRCS}"
@@ -457,12 +486,16 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")

   # The cutlass_scaled_mm kernels for Blackwell SM100 (c3x, i.e. CUTLASS 3.x)
   # require CUDA 12.8 or later
-  cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a" "${CUDA_ARCHS}")
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
+    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
+  else()
+    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
+  endif()
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
     set(SRCS
-      "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu"
-      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu"
-      "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8.cu"
+      "csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm100.cu"
+      "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm100_fp8.cu"
+      "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm100_fp8.cu"
     )
     set_gencode_flags_for_srcs(
       SRCS "${SRCS}"
@@ -493,7 +526,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   # subtract out the archs that are already built for 3x
   list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
   if (SCALED_MM_2X_ARCHS)
-    set(SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu")
+    set(SRCS "csrc/quantization/w8a8/cutlass/scaled_mm_c2x.cu")
     set_gencode_flags_for_srcs(
       SRCS "${SRCS}"
       CUDA_ARCHS "${SCALED_MM_2X_ARCHS}")
@@ -537,7 +570,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")

   # The nvfp4_scaled_mm_sm120 kernels for Geforce Blackwell SM120 require
   # CUDA 12.8 or later
-  cuda_archs_loose_intersection(FP4_ARCHS "12.0;12.0a" "${CUDA_ARCHS}")
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
+    cuda_archs_loose_intersection(FP4_ARCHS "12.0f" "${CUDA_ARCHS}")
+  else()
+    cuda_archs_loose_intersection(FP4_ARCHS "12.0a" "${CUDA_ARCHS}")
+  endif()
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS)
     set(SRCS
       "csrc/quantization/fp4/nvfp4_quant_kernels.cu"
@@ -556,7 +593,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   endif()

   # FP4 Archs and flags
-  cuda_archs_loose_intersection(FP4_ARCHS "10.0a" "${CUDA_ARCHS}")
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
+    cuda_archs_loose_intersection(FP4_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
+  else()
+    cuda_archs_loose_intersection(FP4_ARCHS "10.0a;10.1a;12.0a;12.1a" "${CUDA_ARCHS}")
+  endif()
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS)
     set(SRCS
       "csrc/quantization/fp4/nvfp4_quant_kernels.cu"
@@ -578,10 +619,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   endif()

   # CUTLASS MLA Archs and flags
-  cuda_archs_loose_intersection(MLA_ARCHS "10.0a" "${CUDA_ARCHS}")
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
+    cuda_archs_loose_intersection(MLA_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
+  else()
+    cuda_archs_loose_intersection(MLA_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
+  endif()
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND MLA_ARCHS)
     set(SRCS
-      "csrc/attention/mla/cutlass_mla_kernels.cu"
       "csrc/attention/mla/sm100_cutlass_mla_kernel.cu")
     set_gencode_flags_for_srcs(
       SRCS "${SRCS}"
@@ -605,7 +649,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   # if it's possible to compile MoE kernels that use its output.
   cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a" "${CUDA_ARCHS}")
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
-    set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x_sm90.cu")
+    set(SRCS "csrc/quantization/w8a8/cutlass/moe/grouped_mm_c3x_sm90.cu")
     set_gencode_flags_for_srcs(
       SRCS "${SRCS}"
       CUDA_ARCHS "${SCALED_MM_ARCHS}")
@@ -623,9 +667,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     endif()
   endif()

+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
+    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}")
+  else()
     cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
+  endif()
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
-    set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x_sm100.cu")
+    set(SRCS "csrc/quantization/w8a8/cutlass/moe/grouped_mm_c3x_sm100.cu")
     set_gencode_flags_for_srcs(
       SRCS "${SRCS}"
       CUDA_ARCHS "${SCALED_MM_ARCHS}")
@@ -644,9 +692,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
   endif()

   # moe_data.cu is used by all CUTLASS MoE kernels.
-  cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0a" "${CUDA_ARCHS}")
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
+    cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
+  else()
+    cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
+  endif()
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND CUTLASS_MOE_DATA_ARCHS)
-    set(SRCS "csrc/quantization/cutlass_w8a8/moe/moe_data.cu")
+    set(SRCS "csrc/quantization/w8a8/cutlass/moe/moe_data.cu")
     set_gencode_flags_for_srcs(
       SRCS "${SRCS}"
       CUDA_ARCHS "${CUTLASS_MOE_DATA_ARCHS}")
@@ -663,9 +715,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     endif()
   endif()

-  cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}")
+  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0)
+    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}")
+  else()
+    cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}")
+  endif()
   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS)
-    set(SRCS "csrc/quantization/cutlass_w8a8/moe/blockwise_scaled_group_mm_sm100.cu")
+    set(SRCS "csrc/quantization/w8a8/cutlass/moe/blockwise_scaled_group_mm_sm100.cu")
     set_gencode_flags_for_srcs(
       SRCS "${SRCS}"
       CUDA_ARCHS "${SCALED_MM_ARCHS}")
@@ -779,6 +835,17 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
     endif()
   endif()

+  # Hadacore kernels
+  cuda_archs_loose_intersection(HADACORE_ARCHS "8.0;8.9;9.0" "${CUDA_ARCHS}")
+  if(HADACORE_ARCHS)
+    set(SRCS "csrc/quantization/hadamard/hadacore/hadamard_transform_cuda.cu")
+    set_gencode_flags_for_srcs(
+      SRCS "${SRCS}"
+      CUDA_ARCHS "${HADACORE_ARCHS}")
+    list(APPEND VLLM_EXT_SRC "${SRCS}")
+    message(STATUS "Building hadacore")
+  endif()
+
 # if CUDA endif
 endif()

@@ -816,6 +883,7 @@ target_compile_definitions(_C PRIVATE CUTLASS_ENABLE_DIRECT_CUDA_DRIVER_CALL=1)
 set(VLLM_MOE_EXT_SRC
   "csrc/moe/torch_bindings.cpp"
   "csrc/moe/moe_align_sum_kernels.cu"
+  "csrc/moe/moe_lora_align_sum_kernels.cu"
   "csrc/moe/topk_softmax_kernels.cu")

 if(VLLM_GPU_LANG STREQUAL "CUDA")
@@ -940,6 +1008,7 @@ endif()
 # For CUDA we also build and ship some external projects.
 if (VLLM_GPU_LANG STREQUAL "CUDA")
     include(cmake/external_projects/flashmla.cmake)
+    include(cmake/external_projects/qutlass.cmake)

     # vllm-flash-attn should be last as it overwrites some CMake functions
     include(cmake/external_projects/vllm_flash_attn.cmake)
@@ -21,6 +21,7 @@ Join us at the [PyTorch Conference, October 22-23](https://events.linuxfoundatio
 
 *Latest News* 🔥
 
+- [2025/09] We hosted [vLLM Toronto Meetup](https://luma.com/e80e0ymm) focused on tackling inference at scale and speculative decoding with speakers from NVIDIA and Red Hat! Please find the meetup slides [here](https://docs.google.com/presentation/d/1IYJYmJcu9fLpID5N5RbW_vO0XLo0CGOR14IXOjB61V8/edit?usp=sharing).
 - [2025/08] We hosted [vLLM Shenzhen Meetup](https://mp.weixin.qq.com/s/k8ZBO1u2_2odgiKWH_GVTQ) focusing on the ecosystem around vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Ua2SVKVSu-wp5vou_6ElraDt2bnKhiEA).
 - [2025/08] We hosted [vLLM Singapore Meetup](https://www.sginnovate.com/event/vllm-sg-meet). We shared V1 updates, disaggregated serving and MLLM speedups with speakers from Embedded LLM, AMD, WekaIO, and A*STAR. Please find the meetup slides [here](https://drive.google.com/drive/folders/1ncf3GyqLdqFaB6IeB834E5TZJPLAOiXZ?usp=sharing).
 - [2025/08] We hosted [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/pDmAXHcN7Iqc8sUKgJgGtg) focusing on building, developing, and integrating with vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1OvLx39wnCGy_WKq8SiVKf7YcxxYI3WCH).
@@ -81,7 +82,7 @@ vLLM is flexible and easy to use with:
 - Tensor, pipeline, data and expert parallelism support for distributed inference
 - Streaming outputs
 - OpenAI-compatible API server
-- Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, TPU, and AWS Neuron
+- Support for NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, and TPU. Additionally, support for diverse hardware plugins such as Intel Gaudi, IBM Spyre and Huawei Ascend.
 - Prefix caching support
 - Multi-LoRA support
 
@@ -148,6 +149,7 @@ Compute Resources:
 - Trainy
 - UC Berkeley
 - UC San Diego
+- Volcengine
 
 Slack Sponsor: Anyscale
 
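The feature list above highlights the OpenAI-compatible API server. A minimal sketch of exercising it once a model is being served on the default port 8000 (the model name is a placeholder taken from elsewhere in this document):

```bash
# Query the OpenAI-compatible chat endpoint of a locally running vLLM server.
curl http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
        "model": "NousResearch/Hermes-3-Llama-3.1-8B",
        "messages": [{"role": "user", "content": "Say hello in one sentence."}],
        "max_tokens": 32
      }'
```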
@@ -1,874 +1,20 @@
+# Benchmarks
+
+This directory used to contain vLLM's benchmark scripts and utilities for performance testing and evaluation.
+
+## Contents
+
+- **Serving benchmarks**: Scripts for testing online inference performance (latency, throughput)
+- **Throughput benchmarks**: Scripts for testing offline batch inference performance
+- **Specialized benchmarks**: Tools for testing specific features like structured output, prefix caching, long document QA, request prioritization, and multi-modal inference
+- **Dataset utilities**: Framework for loading and sampling from various benchmark datasets (ShareGPT, HuggingFace datasets, synthetic data, etc.)
+
+## Usage
+
+For detailed usage instructions, examples, and dataset information, see the [Benchmark CLI documentation](https://docs.vllm.ai/en/latest/contributing/benchmarks.html#benchmark-cli).
+
+For full CLI reference see:
+
+- <https://docs.vllm.ai/en/latest/cli/bench/latency.html>
+- <https://docs.vllm.ai/en/latest/cli/bench/serve.html>
+- <https://docs.vllm.ai/en/latest/cli/bench/throughput.html>
-# Benchmarking vLLM
-
-This README guides you through running benchmark tests with the extensive
-datasets supported on vLLM. It’s a living document, updated as new features and datasets
-become available.
-
-## Dataset Overview
-
-| Dataset | Online | Offline | Data Path |
-|---------|:------:|:-------:|-----------|
-| **ShareGPT** | ✅ | ✅ | `wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json` |
-| **ShareGPT4V (Image)** | ✅ | ✅ | `wget https://huggingface.co/datasets/Lin-Chen/ShareGPT4V/blob/main/sharegpt4v_instruct_gpt4-vision_cap100k.json`<br>Note that the images need to be downloaded separately. For example, to download COCO's 2017 Train images: `wget http://images.cocodataset.org/zips/train2017.zip` |
-| **ShareGPT4Video (Video)** | ✅ | ✅ | `git clone https://huggingface.co/datasets/ShareGPT4Video/ShareGPT4Video` |
-| **BurstGPT** | ✅ | ✅ | `wget https://github.com/HPMLL/BurstGPT/releases/download/v1.1/BurstGPT_without_fails_2.csv` |
-| **Sonnet (deprecated)** | ✅ | ✅ | Local file: `benchmarks/sonnet.txt` |
-| **Random** | ✅ | ✅ | `synthetic` |
-| **RandomMultiModal (Image/Video)** | 🟡 | 🚧 | `synthetic` |
-| **Prefix Repetition** | ✅ | ✅ | `synthetic` |
-| **HuggingFace-VisionArena** | ✅ | ✅ | `lmarena-ai/VisionArena-Chat` |
-| **HuggingFace-InstructCoder** | ✅ | ✅ | `likaixin/InstructCoder` |
-| **HuggingFace-AIMO** | ✅ | ✅ | `AI-MO/aimo-validation-aime`, `AI-MO/NuminaMath-1.5`, `AI-MO/NuminaMath-CoT` |
-| **HuggingFace-Other** | ✅ | ✅ | `lmms-lab/LLaVA-OneVision-Data`, `Aeala/ShareGPT_Vicuna_unfiltered` |
-| **HuggingFace-MTBench** | ✅ | ✅ | `philschmid/mt-bench` |
-| **HuggingFace-Blazedit** | ✅ | ✅ | `vdaita/edit_5k_char`, `vdaita/edit_10k_char` |
-| **Spec Bench** | ✅ | ✅ | `wget https://raw.githubusercontent.com/hemingkx/Spec-Bench/refs/heads/main/data/spec_bench/question.jsonl` |
-| **Custom** | ✅ | ✅ | Local file: `data.jsonl` |
-
-✅: supported
-
-🟡: Partial support
-
-🚧: to be supported
-
-**Note**: For HuggingFace datasets, `dataset-name` should be set to `hf`.
-For a local `dataset-path`, set `hf-name` to its Hugging Face ID, for example:
-
-```bash
---dataset-path /datasets/VisionArena-Chat/ --hf-name lmarena-ai/VisionArena-Chat
-```
-
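For completeness, a sketch of how the `--dataset-path`/`--hf-name` pair from the note above fits into a full serving-benchmark command (paths, model, and prompt count are placeholders; the command shape mirrors the VisionArena example later in this guide):

```bash
vllm bench serve \
  --backend openai-chat \
  --model Qwen/Qwen2-VL-7B-Instruct \
  --endpoint /v1/chat/completions \
  --dataset-name hf \
  --dataset-path /datasets/VisionArena-Chat/ \
  --hf-name lmarena-ai/VisionArena-Chat \
  --hf-split train \
  --num-prompts 100
```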
-## 🚀 Example - Online Benchmark
-
-<details>
-<summary>Show more</summary>
-
-<br/>
-
-First start serving your model:
-
-```bash
-vllm serve NousResearch/Hermes-3-Llama-3.1-8B
-```
-
-Then run the benchmarking script:
-
-```bash
-# download dataset
-# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
-vllm bench serve \
-  --backend vllm \
-  --model NousResearch/Hermes-3-Llama-3.1-8B \
-  --endpoint /v1/completions \
-  --dataset-name sharegpt \
-  --dataset-path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
-  --num-prompts 10
-```
-
-If successful, you will see the following output:
-
-```text
-============ Serving Benchmark Result ============
-Successful requests:                     10
-Benchmark duration (s):                  5.78
-Total input tokens:                      1369
-Total generated tokens:                  2212
-Request throughput (req/s):              1.73
-Output token throughput (tok/s):         382.89
-Total Token throughput (tok/s):          619.85
----------------Time to First Token----------------
-Mean TTFT (ms):                          71.54
-Median TTFT (ms):                        73.88
-P99 TTFT (ms):                           79.49
------Time per Output Token (excl. 1st token)------
-Mean TPOT (ms):                          7.91
-Median TPOT (ms):                        7.96
-P99 TPOT (ms):                           8.03
----------------Inter-token Latency----------------
-Mean ITL (ms):                           7.74
-Median ITL (ms):                         7.70
-P99 ITL (ms):                            8.39
-==================================================
-```
-
-### Custom Dataset
-
-If the dataset you want to benchmark is not supported in vLLM yet, you can still benchmark it using `CustomDataset`. Your data needs to be in `.jsonl` format and have a "prompt" field per entry, e.g. `data.jsonl`:
-
-```json
-{"prompt": "What is the capital of India?"}
-{"prompt": "What is the capital of Iran?"}
-{"prompt": "What is the capital of China?"}
-```
-
-```bash
-# start server
-VLLM_USE_V1=1 vllm serve meta-llama/Llama-3.1-8B-Instruct
-```
-
-```bash
-# run benchmarking script
-vllm bench serve --port 9001 --save-result --save-detailed \
-  --backend vllm \
-  --model meta-llama/Llama-3.1-8B-Instruct \
-  --endpoint /v1/completions \
-  --dataset-name custom \
-  --dataset-path <path-to-your-data-jsonl> \
-  --custom-skip-chat-template \
-  --num-prompts 80 \
-  --max-concurrency 1 \
-  --temperature=0.3 \
-  --top-p=0.75 \
-  --result-dir "./log/"
-```
-
-You can skip applying the chat template if your data already has it by using `--custom-skip-chat-template`.
-
-### VisionArena Benchmark for Vision Language Models
-
-```bash
-# need a model with vision capability here
-vllm serve Qwen/Qwen2-VL-7B-Instruct
-```
-
-```bash
-vllm bench serve \
-  --backend openai-chat \
-  --endpoint-type openai-chat \
-  --model Qwen/Qwen2-VL-7B-Instruct \
-  --endpoint /v1/chat/completions \
-  --dataset-name hf \
-  --dataset-path lmarena-ai/VisionArena-Chat \
-  --hf-split train \
-  --num-prompts 1000
-```
-
-### InstructCoder Benchmark with Speculative Decoding
-
-```bash
-VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \
-    --speculative-config $'{"method": "ngram",
-    "num_speculative_tokens": 5, "prompt_lookup_max": 5,
-    "prompt_lookup_min": 2}'
-```
-
-```bash
-vllm bench serve \
-    --model meta-llama/Meta-Llama-3-8B-Instruct \
-    --dataset-name hf \
-    --dataset-path likaixin/InstructCoder \
-    --num-prompts 2048
-```
-
-### Spec Bench Benchmark with Speculative Decoding
-
-```bash
-VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \
-    --speculative-config $'{"method": "ngram",
-    "num_speculative_tokens": 5, "prompt_lookup_max": 5,
-    "prompt_lookup_min": 2}'
-```
-
-[SpecBench dataset](https://github.com/hemingkx/Spec-Bench)
-
-Run all categories:
-
-```bash
-# Download the dataset using:
-# wget https://raw.githubusercontent.com/hemingkx/Spec-Bench/refs/heads/main/data/spec_bench/question.jsonl
-
-vllm bench serve \
-    --model meta-llama/Meta-Llama-3-8B-Instruct \
-    --dataset-name spec_bench \
-    --dataset-path "<YOUR_DOWNLOADED_PATH>/data/spec_bench/question.jsonl" \
-    --num-prompts -1
-```
-
-Available categories include `[writing, roleplay, reasoning, math, coding, extraction, stem, humanities, translation, summarization, qa, math_reasoning, rag]`.
-
-Run only a specific category like "summarization":
-
-```bash
-vllm bench serve \
-    --model meta-llama/Meta-Llama-3-8B-Instruct \
-    --dataset-name spec_bench \
-    --dataset-path "<YOUR_DOWNLOADED_PATH>/data/spec_bench/question.jsonl" \
-    --num-prompts -1 \
-    --spec-bench-category "summarization"
-```
-
-### Other HuggingFaceDataset Examples
-
-```bash
-vllm serve Qwen/Qwen2-VL-7B-Instruct
-```
-
-`lmms-lab/LLaVA-OneVision-Data`:
-
-```bash
-vllm bench serve \
-  --backend openai-chat \
-  --endpoint-type openai-chat \
-  --model Qwen/Qwen2-VL-7B-Instruct \
-  --endpoint /v1/chat/completions \
-  --dataset-name hf \
-  --dataset-path lmms-lab/LLaVA-OneVision-Data \
-  --hf-split train \
-  --hf-subset "chart2text(cauldron)" \
-  --num-prompts 10
-```
-
-`Aeala/ShareGPT_Vicuna_unfiltered`:
-
-```bash
-vllm bench serve \
-  --backend openai-chat \
-  --endpoint-type openai-chat \
-  --model Qwen/Qwen2-VL-7B-Instruct \
-  --endpoint /v1/chat/completions \
-  --dataset-name hf \
-  --dataset-path Aeala/ShareGPT_Vicuna_unfiltered \
-  --hf-split train \
-  --num-prompts 10
-```
-
-`AI-MO/aimo-validation-aime`:
-
-```bash
-vllm bench serve \
-    --model Qwen/QwQ-32B \
-    --dataset-name hf \
-    --dataset-path AI-MO/aimo-validation-aime \
-    --num-prompts 10 \
-    --seed 42
-```
-
-`philschmid/mt-bench`:
-
-```bash
-vllm bench serve \
-    --model Qwen/QwQ-32B \
-    --dataset-name hf \
-    --dataset-path philschmid/mt-bench \
-    --num-prompts 80
-```
-
-`vdaita/edit_5k_char` or `vdaita/edit_10k_char`:
-
-```bash
-vllm bench serve \
-    --model Qwen/QwQ-32B \
-    --dataset-name hf \
-    --dataset-path vdaita/edit_5k_char \
-    --num-prompts 90 \
-    --blazedit-min-distance 0.01 \
-    --blazedit-max-distance 0.99
-```
-
-### Running With Sampling Parameters
-
-When using OpenAI-compatible backends such as `vllm`, optional sampling
-parameters can be specified. Example client command:
-
-```bash
-vllm bench serve \
-  --backend vllm \
-  --model NousResearch/Hermes-3-Llama-3.1-8B \
-  --endpoint /v1/completions \
-  --dataset-name sharegpt \
-  --dataset-path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
-  --top-k 10 \
-  --top-p 0.9 \
-  --temperature 0.5 \
-  --num-prompts 10
-```
-
-### Running With Ramp-Up Request Rate
-
-The benchmark tool also supports ramping up the request rate over the
-duration of the benchmark run. This can be useful for stress testing the
-server or finding the maximum throughput that it can handle, given some latency budget.
-
-Two ramp-up strategies are supported:
-
-- `linear`: Increases the request rate linearly from a start value to an end value.
-- `exponential`: Increases the request rate exponentially.
-
-The following arguments can be used to control the ramp-up (a combined example follows this section):
-
-- `--ramp-up-strategy`: The ramp-up strategy to use (`linear` or `exponential`).
-- `--ramp-up-start-rps`: The request rate at the beginning of the benchmark.
-- `--ramp-up-end-rps`: The request rate at the end of the benchmark.
-
-</details>
-
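A minimal sketch combining the ramp-up flags listed above with the ShareGPT serving benchmark from the start of this section (the strategy, rates, and prompt count are arbitrary placeholder values):

```bash
vllm bench serve \
  --backend vllm \
  --model NousResearch/Hermes-3-Llama-3.1-8B \
  --endpoint /v1/completions \
  --dataset-name sharegpt \
  --dataset-path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
  --num-prompts 1000 \
  --ramp-up-strategy linear \
  --ramp-up-start-rps 1 \
  --ramp-up-end-rps 20
```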
-## 📈 Example - Offline Throughput Benchmark
-
-<details>
-<summary>Show more</summary>
-
-<br/>
-
-```bash
-vllm bench throughput \
-  --model NousResearch/Hermes-3-Llama-3.1-8B \
-  --dataset-name sonnet \
-  --dataset-path vllm/benchmarks/sonnet.txt \
-  --num-prompts 10
-```
-
-If successful, you will see the following output:
-
-```text
-Throughput: 7.15 requests/s, 4656.00 total tokens/s, 1072.15 output tokens/s
-Total num prompt tokens:  5014
-Total num output tokens:  1500
-```
-
-### VisionArena Benchmark for Vision Language Models
-
-```bash
-vllm bench throughput \
-  --model Qwen/Qwen2-VL-7B-Instruct \
-  --backend vllm-chat \
-  --dataset-name hf \
-  --dataset-path lmarena-ai/VisionArena-Chat \
-  --num-prompts 1000 \
-  --hf-split train
-```
-
-The `num prompt tokens` now includes image token counts.
-
-```text
-Throughput: 2.55 requests/s, 4036.92 total tokens/s, 326.90 output tokens/s
-Total num prompt tokens:  14527
-Total num output tokens:  1280
-```
-
-### InstructCoder Benchmark with Speculative Decoding
-
-```bash
-VLLM_WORKER_MULTIPROC_METHOD=spawn \
-VLLM_USE_V1=1 \
-vllm bench throughput \
-    --dataset-name=hf \
-    --dataset-path=likaixin/InstructCoder \
-    --model=meta-llama/Meta-Llama-3-8B-Instruct \
-    --input-len=1000 \
-    --output-len=100 \
-    --num-prompts=2048 \
-    --async-engine \
-    --speculative-config $'{"method": "ngram",
-    "num_speculative_tokens": 5, "prompt_lookup_max": 5,
-    "prompt_lookup_min": 2}'
-```
-
-```text
-Throughput: 104.77 requests/s, 23836.22 total tokens/s, 10477.10 output tokens/s
-Total num prompt tokens:  261136
-Total num output tokens:  204800
-```
-
-### Other HuggingFaceDataset Examples
-
-`lmms-lab/LLaVA-OneVision-Data`:
-
-```bash
-vllm bench throughput \
-  --model Qwen/Qwen2-VL-7B-Instruct \
-  --backend vllm-chat \
-  --dataset-name hf \
-  --dataset-path lmms-lab/LLaVA-OneVision-Data \
-  --hf-split train \
-  --hf-subset "chart2text(cauldron)" \
-  --num-prompts 10
-```
-
-`Aeala/ShareGPT_Vicuna_unfiltered`:
-
-```bash
-vllm bench throughput \
-  --model Qwen/Qwen2-VL-7B-Instruct \
-  --backend vllm-chat \
-  --dataset-name hf \
-  --dataset-path Aeala/ShareGPT_Vicuna_unfiltered \
-  --hf-split train \
-  --num-prompts 10
-```
-
-`AI-MO/aimo-validation-aime`:
-
-```bash
-vllm bench throughput \
-  --model Qwen/QwQ-32B \
-  --backend vllm \
-  --dataset-name hf \
-  --dataset-path AI-MO/aimo-validation-aime \
-  --hf-split train \
-  --num-prompts 10
-```
-
-Benchmark with LoRA adapters:
-
-```bash
-# download dataset
-# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
-vllm bench throughput \
-  --model meta-llama/Llama-2-7b-hf \
-  --backend vllm \
-  --dataset-path <your data path>/ShareGPT_V3_unfiltered_cleaned_split.json \
-  --dataset-name sharegpt \
-  --num-prompts 10 \
-  --max-loras 2 \
-  --max-lora-rank 8 \
-  --enable-lora \
-  --lora-path yard1/llama-2-7b-sql-lora-test
-```
-
-</details>
-
-## 🛠️ Example - Structured Output Benchmark
-
-<details>
-<summary>Show more</summary>
-
-<br/>
-
-Benchmark the performance of structured output generation (JSON, grammar, regex).
-
-### Server Setup
-
-```bash
-vllm serve NousResearch/Hermes-3-Llama-3.1-8B
-```
-
-### JSON Schema Benchmark
-
-```bash
-python3 benchmarks/benchmark_serving_structured_output.py \
-  --backend vllm \
-  --model NousResearch/Hermes-3-Llama-3.1-8B \
-  --dataset json \
-  --structured-output-ratio 1.0 \
-  --request-rate 10 \
-  --num-prompts 1000
-```
-
-### Grammar-based Generation Benchmark
-
-```bash
-python3 benchmarks/benchmark_serving_structured_output.py \
-  --backend vllm \
-  --model NousResearch/Hermes-3-Llama-3.1-8B \
-  --dataset grammar \
-  --structure-type grammar \
-  --request-rate 10 \
-  --num-prompts 1000
-```
-
-### Regex-based Generation Benchmark
-
-```bash
-python3 benchmarks/benchmark_serving_structured_output.py \
-  --backend vllm \
-  --model NousResearch/Hermes-3-Llama-3.1-8B \
-  --dataset regex \
-  --request-rate 10 \
-  --num-prompts 1000
-```
-
-### Choice-based Generation Benchmark
-
-```bash
-python3 benchmarks/benchmark_serving_structured_output.py \
-  --backend vllm \
-  --model NousResearch/Hermes-3-Llama-3.1-8B \
-  --dataset choice \
-  --request-rate 10 \
-  --num-prompts 1000
-```
-
-### XGrammar Benchmark Dataset
-
-```bash
-python3 benchmarks/benchmark_serving_structured_output.py \
-  --backend vllm \
-  --model NousResearch/Hermes-3-Llama-3.1-8B \
-  --dataset xgrammar_bench \
-  --request-rate 10 \
-  --num-prompts 1000
-```
-
-</details>
-
-## 📚 Example - Long Document QA Benchmark
-
-<details>
-<summary>Show more</summary>
-
-<br/>
-
-Benchmark the performance of long document question-answering with prefix caching.
-
-### Basic Long Document QA Test
-
-```bash
-python3 benchmarks/benchmark_long_document_qa_throughput.py \
-  --model meta-llama/Llama-2-7b-chat-hf \
-  --enable-prefix-caching \
-  --num-documents 16 \
-  --document-length 2000 \
-  --output-len 50 \
-  --repeat-count 5
-```
-
-### Different Repeat Modes
-
-```bash
-# Random mode (default) - shuffle prompts randomly
-python3 benchmarks/benchmark_long_document_qa_throughput.py \
-  --model meta-llama/Llama-2-7b-chat-hf \
-  --enable-prefix-caching \
-  --num-documents 8 \
-  --document-length 3000 \
-  --repeat-count 3 \
-  --repeat-mode random
-
-# Tile mode - repeat entire prompt list in sequence
-python3 benchmarks/benchmark_long_document_qa_throughput.py \
-  --model meta-llama/Llama-2-7b-chat-hf \
-  --enable-prefix-caching \
-  --num-documents 8 \
-  --document-length 3000 \
-  --repeat-count 3 \
-  --repeat-mode tile
-
-# Interleave mode - repeat each prompt consecutively
-python3 benchmarks/benchmark_long_document_qa_throughput.py \
-  --model meta-llama/Llama-2-7b-chat-hf \
-  --enable-prefix-caching \
-  --num-documents 8 \
-  --document-length 3000 \
-  --repeat-count 3 \
-  --repeat-mode interleave
-```
-
-</details>
-
-## 🗂️ Example - Prefix Caching Benchmark
-
-<details>
-<summary>Show more</summary>
-
-<br/>
-
-Benchmark the efficiency of automatic prefix caching.
-
-### Fixed Prompt with Prefix Caching
-
-```bash
-python3 benchmarks/benchmark_prefix_caching.py \
-  --model meta-llama/Llama-2-7b-chat-hf \
-  --enable-prefix-caching \
-  --num-prompts 1 \
-  --repeat-count 100 \
-  --input-length-range 128:256
-```
-
-### ShareGPT Dataset with Prefix Caching
-
-```bash
-# download dataset
-# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
-
-python3 benchmarks/benchmark_prefix_caching.py \
-  --model meta-llama/Llama-2-7b-chat-hf \
-  --dataset-path /path/ShareGPT_V3_unfiltered_cleaned_split.json \
-  --enable-prefix-caching \
-  --num-prompts 20 \
-  --repeat-count 5 \
-  --input-length-range 128:256
-```
-
-### Prefix Repetition Dataset
-
-```bash
-vllm bench serve \
-  --backend openai \
-  --model meta-llama/Llama-2-7b-chat-hf \
-  --dataset-name prefix_repetition \
-  --num-prompts 100 \
-  --prefix-repetition-prefix-len 512 \
-  --prefix-repetition-suffix-len 128 \
-  --prefix-repetition-num-prefixes 5 \
-  --prefix-repetition-output-len 128
-```
-
-</details>
-
-## ⚡ Example - Request Prioritization Benchmark
-
-<details>
-<summary>Show more</summary>
-
-<br/>
-
-Benchmark the performance of request prioritization in vLLM.
-
-### Basic Prioritization Test
-
-```bash
-python3 benchmarks/benchmark_prioritization.py \
-  --model meta-llama/Llama-2-7b-chat-hf \
-  --input-len 128 \
-  --output-len 64 \
-  --num-prompts 100 \
-  --scheduling-policy priority
-```
-
-### Multiple Sequences per Prompt
-
-```bash
-python3 benchmarks/benchmark_prioritization.py \
-  --model meta-llama/Llama-2-7b-chat-hf \
-  --input-len 128 \
-  --output-len 64 \
-  --num-prompts 100 \
-  --scheduling-policy priority \
-  --n 2
-```
-
-</details>
-
-## 👁️ Example - Multi-Modal Benchmark
-
-<details>
-<summary>Show more</summary>
-
-<br/>
-
-Benchmark the performance of multi-modal requests in vLLM.
-
-### Images (ShareGPT4V)
-
-Start vLLM:
-
-```bash
-python -m vllm.entrypoints.openai.api_server \
-  --model Qwen/Qwen2.5-VL-7B-Instruct \
-  --dtype bfloat16 \
-  --limit-mm-per-prompt '{"image": 1}' \
-  --allowed-local-media-path /path/to/sharegpt4v/images
-```
-
-Send requests with images:
-
-```bash
-vllm bench serve \
-  --backend openai-chat \
-  --model Qwen/Qwen2.5-VL-7B-Instruct \
-  --dataset-name sharegpt \
-  --dataset-path /path/to/ShareGPT4V/sharegpt4v_instruct_gpt4-vision_cap100k.json \
-  --num-prompts 100 \
-  --save-result \
-  --result-dir ~/vllm_benchmark_results \
-  --save-detailed \
-  --endpoint /v1/chat/completions
-```
-
-### Videos (ShareGPT4Video)
-
-Start vLLM:
-
-```bash
-python -m vllm.entrypoints.openai.api_server \
-  --model Qwen/Qwen2.5-VL-7B-Instruct \
-  --dtype bfloat16 \
-  --limit-mm-per-prompt '{"video": 1}' \
-  --allowed-local-media-path /path/to/sharegpt4video/videos
-```
-
-Send requests with videos:
-
-```bash
-vllm bench serve \
-  --backend openai-chat \
-  --model Qwen/Qwen2.5-VL-7B-Instruct \
-  --dataset-name sharegpt \
-  --dataset-path /path/to/ShareGPT4Video/llava_v1_5_mix665k_with_video_chatgpt72k_share4video28k.json \
-  --num-prompts 100 \
-  --save-result \
-  --result-dir ~/vllm_benchmark_results \
-  --save-detailed \
-  --endpoint /v1/chat/completions
-```
-
-### Synthetic Random Images (random-mm)
-
-Generate synthetic image inputs alongside random text prompts to stress-test vision models without external datasets.
-
-Notes:
-
-- Works only with online benchmark via the OpenAI backend (`--backend openai-chat`) and endpoint `/v1/chat/completions`.
-- Video sampling is not yet implemented.
-
-Start the server (example):
-
-```bash
-vllm serve Qwen/Qwen2.5-VL-3B-Instruct \
-  --dtype bfloat16 \
-  --max-model-len 16384 \
-  --limit-mm-per-prompt '{"image": 3, "video": 0}' \
-  --mm-processor-kwargs max_pixels=1003520
-```
-
-Benchmark. It is recommended to use the flag `--ignore-eos` to simulate real responses. You can set the size of the output via the arg `random-output-len`.
-
-Ex.1: Fixed number of items and a single image resolution, enforcing generation of approx 40 tokens:
-
-```bash
-vllm bench serve \
-  --backend openai-chat \
-  --model Qwen/Qwen2.5-VL-3B-Instruct \
-  --endpoint /v1/chat/completions \
-  --dataset-name random-mm \
-  --num-prompts 100 \
-  --max-concurrency 10 \
-  --random-prefix-len 25 \
-  --random-input-len 300 \
-  --random-output-len 40 \
-  --random-range-ratio 0.2 \
-  --random-mm-base-items-per-request 2 \
-  --random-mm-limit-mm-per-prompt '{"image": 3, "video": 0}' \
-  --random-mm-bucket-config '{(224, 224, 1): 1.0}' \
-  --request-rate inf \
-  --ignore-eos \
-  --seed 42
-```
-
-The number of items per request can be controlled by passing multiple image buckets:
-
-```bash
-  --random-mm-base-items-per-request 2 \
-  --random-mm-num-mm-items-range-ratio 0.5 \
-  --random-mm-limit-mm-per-prompt '{"image": 4, "video": 0}' \
-  --random-mm-bucket-config '{(256, 256, 1): 0.7, (720, 1280, 1): 0.3}' \
-```
-
-Flags specific to `random-mm`:
-
-- `--random-mm-base-items-per-request`: base number of multimodal items per request.
-- `--random-mm-num-mm-items-range-ratio`: vary item count uniformly in the closed integer range [floor(n·(1−r)), ceil(n·(1+r))]. Set r=0 to keep it fixed; r=1 allows 0 items.
-- `--random-mm-limit-mm-per-prompt`: per-modality hard caps, e.g. '{"image": 3, "video": 0}'.
-- `--random-mm-bucket-config`: dict mapping (H, W, T) → probability. Entries with probability 0 are removed; remaining probabilities are renormalized to sum to 1. Use T=1 for images. Set any T>1 for videos (video sampling not yet supported).
-
-Behavioral notes:
-
-- If the requested base item count cannot be satisfied under the provided per-prompt limits, the tool raises an error rather than silently clamping.
-
-How sampling works:
-
-- Determine per-request item count k by sampling uniformly from the integer range defined by `--random-mm-base-items-per-request` and `--random-mm-num-mm-items-range-ratio`, then clamp k to at most the sum of per-modality limits.
-- For each of the k items, sample a bucket (H, W, T) according to the normalized probabilities in `--random-mm-bucket-config`, while tracking how many items of each modality have been added.
-- If a modality (e.g., image) reaches its limit from `--random-mm-limit-mm-per-prompt`, all buckets of that modality are excluded and the remaining bucket probabilities are renormalized before continuing. This should be seen as an edge case; it can be avoided by setting `--random-mm-limit-mm-per-prompt` to a large number, although that might then run into the engine config `--limit-mm-per-prompt`.
-- The resulting request contains synthetic image data in `multi_modal_data` (OpenAI Chat format). When `random-mm` is used with the OpenAI Chat backend, prompts remain text and MM content is attached via `multi_modal_data`.
-
-</details>
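The short README that replaces this guide points readers at the `vllm bench` CLI instead. As a quick orientation, a sketch of the three subcommands it links to (models and sizes are placeholders, and flag names may differ between versions; the linked CLI reference pages are authoritative):

```bash
# Offline latency of a single batch (sketch).
vllm bench latency --model NousResearch/Hermes-3-Llama-3.1-8B --input-len 128 --output-len 128

# Offline batch throughput (sketch).
vllm bench throughput --model NousResearch/Hermes-3-Llama-3.1-8B --input-len 256 --output-len 128 --num-prompts 100

# Online serving benchmark against a running `vllm serve` instance (sketch).
vllm bench serve --model NousResearch/Hermes-3-Llama-3.1-8B --dataset-name random --num-prompts 100
```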
@ -149,3 +149,70 @@ The script follows a systematic process to find the optimal parameters:
 | 
				
			|||||||
4. **Track Best Result**: Throughout the process, the script tracks the parameter combination that has yielded the highest valid throughput so far.
 | 
					4. **Track Best Result**: Throughout the process, the script tracks the parameter combination that has yielded the highest valid throughput so far.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
5. **Profile Collection**: For the best-performing run, the script saves the vLLM profiler output, which can be used for deep-dive performance analysis with tools like TensorBoard.
 | 
					5. **Profile Collection**: For the best-performing run, the script saves the vLLM profiler output, which can be used for deep-dive performance analysis with tools like TensorBoard.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					## Batched `auto_tune`
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					The `batch_auto_tune.sh` script allows you to run multiple `auto_tune.sh` experiments sequentially from a single configuration file. It iterates through a list of parameter sets, executes `auto_tune.sh` for each, and records the results back into the input file.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					### Prerequisites
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					- **jq**: This script requires `jq` to parse the JSON configuration file.
 | 
				
			||||||
 | 
					- **gcloud**: If you plan to upload results to Google Cloud Storage, the `gcloud` CLI must be installed and authenticated.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					### How to Run
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					1. **Create a JSON configuration file**: Create a file (e.g., `runs_config.json`) containing an array of JSON objects. Each object defines the parameters for a single `auto_tune.sh` run.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					2. **Execute the script**:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    ```bash
 | 
				
			||||||
 | 
					    bash batch_auto_tune.sh <path_to_json_file> [gcs_upload_path]
 | 
				
			||||||
 | 
					    ```
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    - `<path_to_json_file>`: **Required.** Path to your JSON configuration file.
 | 
				
			||||||
 | 
					    - `[gcs_upload_path]`: **Optional.** A GCS path (e.g., `gs://my-bucket/benchmark-results`) where the detailed results and profiles for each run will be uploaded. If this is empty, the results will be available on the local filesystem (see the log for `RESULT_FILE=/path/to/results/file.txt`).
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					### Configuration File
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					The JSON configuration file should contain an array of objects. Each object's keys correspond to the configuration variables for `auto_tune.sh` (see the [Configuration table above](#configuration)). These keys will be converted to uppercase environment variables for each run.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					Here is an example `runs_config.json` with two benchmark configurations:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					```json
 | 
				
			||||||
 | 
					[
 | 
				
			||||||
 | 
					  {
 | 
				
			||||||
 | 
					    "base": "/home/user",
 | 
				
			||||||
 | 
					    "model": "meta-llama/Llama-3.1-8B-Instruct",
 | 
				
			||||||
 | 
					    "system": "TPU", # OR GPU
 | 
				
			||||||
 | 
					    "tp": 8,
 | 
				
			||||||
 | 
					    "input_len": 128,
 | 
				
			||||||
 | 
					    "output_len": 2048,
 | 
				
			||||||
 | 
					    "max_model_len": 2300,
 | 
				
			||||||
 | 
					    "num_seqs_list": "128 256",
 | 
				
			||||||
 | 
					    "num_batched_tokens_list": "8192 16384"
 | 
				
			||||||
 | 
					  },
 | 
				
			||||||
 | 
					  {
 | 
				
			||||||
 | 
					    "base": "/home/user",
 | 
				
			||||||
 | 
					    "model": "meta-llama/Llama-3.1-70B-Instruct",
 | 
				
			||||||
 | 
					    "system": "TPU", # OR GPU
 | 
				
			||||||
 | 
					    "tp": 8,
 | 
				
			||||||
 | 
					    "input_len": 4000,
 | 
				
			||||||
 | 
					    "output_len": 16,
 | 
				
			||||||
 | 
					    "max_model_len": 4096,
 | 
				
			||||||
 | 
					    "num_seqs_list": "64 128",
 | 
				
			||||||
 | 
					    "num_batched_tokens_list": "4096 8192",
 | 
				
			||||||
 | 
					    "max_latency_allowed_ms": 500
 | 
				
			||||||
 | 
					  }
 | 
				
			||||||
 | 
					]
 | 
				
			||||||
 | 
					```
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 

### Output

The script modifies the input JSON file in place, adding the results of each run to the corresponding object. The following fields are added:

- `run_id`: A unique identifier for the run, derived from the timestamp.
- `status`: The outcome of the run (`SUCCESS`, `FAILURE`, or `WARNING_NO_RESULT_FILE`).
- `results`: The content of the `result.txt` file from the `auto_tune.sh` run.
- `gcs_results`: The GCS URL where the run's artifacts are stored (if a GCS path was provided).

A summary of successful and failed runs is also printed to the console upon completion. An illustrative post-run entry is sketched below.
			|||||||
@ -74,7 +74,7 @@ start_server() {
 | 
				
			|||||||
    local vllm_log=$4
 | 
					    local vllm_log=$4
 | 
				
			||||||
    local profile_dir=$5
 | 
					    local profile_dir=$5
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    pkill -if vllm
 | 
					    pkill -if "vllm serve" || true
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    # Define the common arguments as a bash array.
 | 
					    # Define the common arguments as a bash array.
 | 
				
			||||||
    # Each argument and its value are separate elements.
 | 
					    # Each argument and its value are separate elements.
 | 
				
			||||||
@ -96,17 +96,22 @@ start_server() {
 | 
				
			|||||||
    # This correctly passes each element as a separate argument.
 | 
					    # This correctly passes each element as a separate argument.
 | 
				
			||||||
    if [[ -n "$profile_dir" ]]; then
 | 
					    if [[ -n "$profile_dir" ]]; then
 | 
				
			||||||
        # Start server with profiling enabled
 | 
					        # Start server with profiling enabled
 | 
				
			||||||
        VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir \
 | 
					        VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir \
 | 
				
			||||||
            vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 &
 | 
					            vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 &
 | 
				
			||||||
    else
 | 
					    else
 | 
				
			||||||
        # Start server without profiling
 | 
					        # Start server without profiling
 | 
				
			||||||
        VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 \
 | 
					        VLLM_SERVER_DEV_MODE=1 \
 | 
				
			||||||
            vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 &
 | 
					            vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 &
 | 
				
			||||||
    fi
 | 
					    fi
 | 
				
			||||||
 | 
					    local server_pid=$!
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    # wait for 10 minutes...
 | 
					    # wait for 10 minutes...
 | 
				
			||||||
    server_started=0
 | 
					    server_started=0
 | 
				
			||||||
    for i in {1..60}; do
 | 
					    for i in {1..60}; do
 | 
				
			||||||
 | 
					        # This line checks whether the server is still alive or not,
 | 
				
			||||||
 | 
					        # since that we should always have permission to send signal to the server process.
 | 
				
			||||||
 | 
					        kill -0 $server_pid 2> /dev/null || break
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        RESPONSE=$(curl -s -X GET "http://0.0.0.0:8004/health" -w "%{http_code}" -o /dev/stdout)
 | 
					        RESPONSE=$(curl -s -X GET "http://0.0.0.0:8004/health" -w "%{http_code}" -o /dev/stdout)
 | 
				
			||||||
        STATUS_CODE=$(echo "$RESPONSE" | tail -n 1)
 | 
					        STATUS_CODE=$(echo "$RESPONSE" | tail -n 1)
 | 
				
			||||||
        if [[ "$STATUS_CODE" -eq 200 ]]; then
 | 
					        if [[ "$STATUS_CODE" -eq 200 ]]; then
 | 
				
			||||||
@ -118,7 +123,7 @@ start_server() {
 | 
				
			|||||||
    done
 | 
					    done
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    if (( ! server_started )); then
 | 
					    if (( ! server_started )); then
 | 
				
			||||||
        echo "server did not start within 10 minutes. Please check server log at $vllm_log".
 | 
					        echo "server did not start within 10 minutes or crashed. Please check server log at $vllm_log".
 | 
				
			||||||
        return 1
 | 
					        return 1
 | 
				
			||||||
    else
 | 
					    else
 | 
				
			||||||
        return 0
 | 
					        return 0
 | 
				
			||||||
@ -134,7 +139,7 @@ run_benchmark() {
 | 
				
			|||||||
    echo "vllm_log: $vllm_log"
 | 
					    echo "vllm_log: $vllm_log"
 | 
				
			||||||
    echo
 | 
					    echo
 | 
				
			||||||
    rm -f $vllm_log
 | 
					    rm -f $vllm_log
 | 
				
			||||||
    pkill -if vllm
 | 
					    pkill -if "vllm serve" || true
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    echo "starting server..."
 | 
					    echo "starting server..."
 | 
				
			||||||
    # Call start_server without a profile_dir to avoid profiling overhead
 | 
					    # Call start_server without a profile_dir to avoid profiling overhead
 | 
				
			||||||
@ -227,7 +232,7 @@ run_benchmark() {
 | 
				
			|||||||
 | 
					
 | 
				
			||||||
    echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput"
 | 
					    echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    pkill -if vllm
 | 
					    pkill -if "vllm serve" || true
 | 
				
			||||||
    sleep 10
 | 
					    sleep 10
 | 
				
			||||||
    echo "===================="
 | 
					    echo "===================="
 | 
				
			||||||
    return 0
 | 
					    return 0
 | 
				
			||||||
@ -303,6 +308,6 @@ if (( $(echo "$best_throughput > 0" | bc -l) )); then
 | 
				
			|||||||
else
 | 
					else
 | 
				
			||||||
    echo "No configuration met the latency requirements. Skipping final profiling run."
 | 
					    echo "No configuration met the latency requirements. Skipping final profiling run."
 | 
				
			||||||
fi
 | 
					fi
 | 
				
			||||||
pkill -if vllm
 | 
					pkill -if "vllm serve" || true
 | 
				
			||||||
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH"
 | 
					echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH"
 | 
				
			||||||
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH" >> "$RESULT"
 | 
					echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH" >> "$RESULT"
 | 
				
			||||||
 | 
				

benchmarks/auto_tune/batch_auto_tune.sh (new executable file, 128 lines):

```bash
#!/bin/bash

INPUT_JSON="$1"
GCS_PATH="$2" # Optional GCS path for uploading results for each run

SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)
AUTOTUNE_SCRIPT="$SCRIPT_DIR/auto_tune.sh"

if [[ -z "$INPUT_JSON" ]]; then
  echo "Error: Input JSON file not provided."
  echo "Usage: $0 <path_to_json_file> [gcs_upload_path]"
  exit 1
fi

if [[ ! -f "$INPUT_JSON" ]]; then
  echo "Error: File not found at '$INPUT_JSON'"
  exit 1
fi

if ! command -v jq &> /dev/null; then
    echo "Error: 'jq' command not found. Please install jq to process the JSON input."
    exit 1
fi

if [[ -n "$GCS_PATH" ]] && ! command -v gcloud &> /dev/null; then
    echo "Error: 'gcloud' command not found, but a GCS_PATH was provided."
    exit 1
fi

SUCCESS_COUNT=0
FAILURE_COUNT=0
FAILED_RUNS=()
SCRIPT_START_TIME=$(date +%s)

json_content=$(cat "$INPUT_JSON")
if ! num_runs=$(echo "$json_content" | jq 'length'); then
  echo "Error: Invalid JSON in $INPUT_JSON. 'jq' failed to get array length." >&2
  exit 1
fi

echo "Found $num_runs benchmark configurations in $INPUT_JSON."
echo "Starting benchmark runs..."
echo "--------------------------------------------------"

for i in $(seq 0 $(($num_runs - 1))); do
  run_object=$(echo "$json_content" | jq ".[$i]")

  RUN_START_TIME=$(date +%s)
  ENV_VARS_ARRAY=()
  # Dynamically create env vars from the JSON object's keys
  for key in $(echo "$run_object" | jq -r 'keys_unsorted[]'); do
    value=$(echo "$run_object" | jq -r ".$key")
    var_name=$(echo "$key" | tr '[:lower:]' '[:upper:]' | tr -cd 'A-Z0-9_')
    ENV_VARS_ARRAY+=("${var_name}=${value}")
  done

  echo "Executing run #$((i+1))/$num_runs with parameters: ${ENV_VARS_ARRAY[*]}"

  # Execute auto_tune.sh and capture output
  RUN_OUTPUT_FILE=$(mktemp)
  if env "${ENV_VARS_ARRAY[@]}" bash "$AUTOTUNE_SCRIPT" > >(tee -a "$RUN_OUTPUT_FILE") 2>&1; then
    STATUS="SUCCESS"
    ((SUCCESS_COUNT++))
  else
    STATUS="FAILURE"
    ((FAILURE_COUNT++))
    FAILED_RUNS+=("Run #$((i+1)): $(echo $run_object | jq -c .)")
  fi

  RUN_OUTPUT=$(<"$RUN_OUTPUT_FILE")
  rm "$RUN_OUTPUT_FILE"

  # Parse results and optionally upload them to GCS
  RUN_ID=""
  RESULTS=""
  GCS_RESULTS_URL=""
  if [[ "$STATUS" == "SUCCESS" ]]; then
    RESULT_FILE_PATH=$(echo "$RUN_OUTPUT" | grep 'RESULT_FILE=' | tail -n 1 | cut -d'=' -f2 | tr -s '/' || true)

    if [[ -n "$RESULT_FILE_PATH" && -f "$RESULT_FILE_PATH" ]]; then
      RUN_ID=$(basename "$(dirname "$RESULT_FILE_PATH")")
      RESULT_DIR=$(dirname "$RESULT_FILE_PATH")
      RESULTS=$(cat "$RESULT_FILE_PATH")

      if [[ -n "$GCS_PATH" ]]; then
        GCS_RESULTS_URL="${GCS_PATH}/${RUN_ID}"
        echo "Uploading results to GCS..."
        if gcloud storage rsync --recursive "$RESULT_DIR/" "$GCS_RESULTS_URL"; then
          echo "GCS upload successful."
        else
          echo "Warning: GCS upload failed for RUN_ID $RUN_ID."
        fi
      fi
    else
      echo "Warning: Could not find result file for a successful run."
      STATUS="WARNING_NO_RESULT_FILE"
    fi
  fi

  # Add the results back into the JSON object for this run
  json_content=$(echo "$json_content" | jq --argjson i "$i" --arg run_id "$RUN_ID" --arg status "$STATUS" --arg results "$RESULTS" --arg gcs_results "$GCS_RESULTS_URL" \
    '.[$i] += {run_id: $run_id, status: $status, results: $results, gcs_results: $gcs_results}')

  RUN_END_TIME=$(date +%s)
  echo "Run finished in $((RUN_END_TIME - RUN_START_TIME)) seconds. Status: $STATUS"
  echo "--------------------------------------------------"

  # Save intermediate progress back to the file
  echo "$json_content" > "$INPUT_JSON.tmp" && mv "$INPUT_JSON.tmp" "$INPUT_JSON"

done

SCRIPT_END_TIME=$(date +%s)
echo "All benchmark runs completed in $((SCRIPT_END_TIME - SCRIPT_START_TIME)) seconds."
echo
echo "====================== SUMMARY ======================"
echo "Successful runs: $SUCCESS_COUNT"
echo "Failed runs:     $FAILURE_COUNT"
echo "==================================================="

if [[ $FAILURE_COUNT -gt 0 ]]; then
  echo "Details of failed runs (see JSON file for full parameters):"
  for failed in "${FAILED_RUNS[@]}"; do
    echo "  - $failed"
  done
fi

echo "Updated results have been saved to '$INPUT_JSON'."
```

```diff
@@ -8,7 +8,6 @@ import sys
 import time
 import traceback
 from dataclasses import dataclass, field
-from typing import Optional, Union
 
 import aiohttp
 import huggingface_hub.constants
@@ -28,13 +27,13 @@ class RequestFuncInput:
     prompt_len: int
     output_len: int
     model: str
-    model_name: Optional[str] = None
-    logprobs: Optional[int] = None
-    extra_body: Optional[dict] = None
-    multi_modal_content: Optional[dict | list[dict]] = None
+    model_name: str | None = None
+    logprobs: int | None = None
+    extra_body: dict | None = None
+    multi_modal_content: dict | list[dict] | None = None
     ignore_eos: bool = False
-    language: Optional[str] = None
-    request_id: Optional[str] = None
+    language: str | None = None
+    request_id: str | None = None
 
 
 @dataclass
@@ -52,7 +51,7 @@ class RequestFuncOutput:
 
 async def async_request_tgi(
     request_func_input: RequestFuncInput,
-    pbar: Optional[tqdm] = None,
+    pbar: tqdm | None = None,
 ) -> RequestFuncOutput:
     api_url = request_func_input.api_url
     assert api_url.endswith("generate_stream")
@@ -133,7 +132,7 @@ async def async_request_tgi(
 
 async def async_request_trt_llm(
     request_func_input: RequestFuncInput,
-    pbar: Optional[tqdm] = None,
+    pbar: tqdm | None = None,
 ) -> RequestFuncOutput:
     api_url = request_func_input.api_url
     assert api_url.endswith("generate_stream")
@@ -204,7 +203,7 @@ async def async_request_trt_llm(
 
 async def async_request_deepspeed_mii(
     request_func_input: RequestFuncInput,
-    pbar: Optional[tqdm] = None,
+    pbar: tqdm | None = None,
 ) -> RequestFuncOutput:
     api_url = request_func_input.api_url
     assert api_url.endswith(("completions", "profile")), (
@@ -267,7 +266,7 @@ async def async_request_deepspeed_mii(
 
 async def async_request_openai_completions(
     request_func_input: RequestFuncInput,
-    pbar: Optional[tqdm] = None,
+    pbar: tqdm | None = None,
 ) -> RequestFuncOutput:
     api_url = request_func_input.api_url
     assert api_url.endswith(("completions", "profile")), (
@@ -367,7 +366,7 @@ async def async_request_openai_completions(
 
 async def async_request_openai_chat_completions(
     request_func_input: RequestFuncInput,
-    pbar: Optional[tqdm] = None,
+    pbar: tqdm | None = None,
 ) -> RequestFuncOutput:
     api_url = request_func_input.api_url
     assert api_url.endswith(("chat/completions", "profile")), (
@@ -476,7 +475,7 @@ async def async_request_openai_chat_completions(
 
 async def async_request_openai_audio(
     request_func_input: RequestFuncInput,
-    pbar: Optional[tqdm] = None,
+    pbar: tqdm | None = None,
 ) -> RequestFuncOutput:
     # Lazy import without PlaceholderModule to avoid vllm dep.
     import soundfile
@@ -610,7 +609,7 @@ def get_tokenizer(
     tokenizer_mode: str = "auto",
     trust_remote_code: bool = False,
     **kwargs,
-) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
+) -> PreTrainedTokenizer | PreTrainedTokenizerFast:
     if pretrained_model_name_or_path is not None and not os.path.exists(
         pretrained_model_name_or_path
     ):
```

```diff
@@ -2,9 +2,9 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import gc
 
+from benchmark_utils import TimeCollector
 from tabulate import tabulate
 
-from benchmark_utils import TimeCollector
 from vllm.utils import FlexibleArgumentParser
 from vllm.v1.core.block_pool import BlockPool
 
```
(One file's diff is suppressed because it is too large.)

```diff
@@ -1,17 +1,31 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import gc
+import time
+from unittest import mock
 
 import numpy as np
+from benchmark_utils import TimeCollector
 from tabulate import tabulate
 
-from benchmark_utils import TimeCollector
-from vllm.config import ModelConfig, SpeculativeConfig, VllmConfig
+from vllm.config import (
+    CacheConfig,
+    DeviceConfig,
+    LoadConfig,
+    ModelConfig,
+    ParallelConfig,
+    SchedulerConfig,
+    SpeculativeConfig,
+    VllmConfig,
+)
+from vllm.platforms import current_platform
 from vllm.utils import FlexibleArgumentParser
 from vllm.v1.spec_decode.ngram_proposer import NgramProposer
+from vllm.v1.worker.gpu_input_batch import InputBatch
+from vllm.v1.worker.gpu_model_runner import GPUModelRunner
 
 
-def main(args):
+def benchmark_propose(args):
     rows = []
     for max_ngram in args.max_ngram:
         collector = TimeCollector(TimeCollector.US)
@@ -69,10 +83,88 @@ def main(args):
     )
 
 
+def benchmark_batched_propose(args):
+    NUM_SPECULATIVE_TOKENS_NGRAM = 10
+    PROMPT_LOOKUP_MIN = 5
+    PROMPT_LOOKUP_MAX = 15
+    MAX_MODEL_LEN = int(1e7)
+    DEVICE = current_platform.device_type
+
+    model_config = ModelConfig(model="facebook/opt-125m", runner="generate")
+
+    speculative_config = SpeculativeConfig(
+        target_model_config=model_config,
+        target_parallel_config=ParallelConfig(),
+        method="ngram",
+        num_speculative_tokens=NUM_SPECULATIVE_TOKENS_NGRAM,
+        prompt_lookup_max=PROMPT_LOOKUP_MAX,
+        prompt_lookup_min=PROMPT_LOOKUP_MIN,
+    )
+
+    vllm_config = VllmConfig(
+        model_config=model_config,
+        cache_config=CacheConfig(),
+        speculative_config=speculative_config,
+        device_config=DeviceConfig(device=current_platform.device_type),
+        parallel_config=ParallelConfig(),
+        load_config=LoadConfig(),
+        scheduler_config=SchedulerConfig(),
+    )
+
+    # monkey patch vllm.v1.worker.gpu_model_runner.get_pp_group
+    mock_pp_group = mock.MagicMock()
+    mock_pp_group.world_size = 1
+    with mock.patch(
+        "vllm.v1.worker.gpu_model_runner.get_pp_group", return_value=mock_pp_group
+    ):
+        runner = GPUModelRunner(vllm_config, DEVICE)
+
+        # hack max model len
+        runner.max_model_len = MAX_MODEL_LEN
+        runner.drafter.max_model_len = MAX_MODEL_LEN
+
+        dummy_input_batch = InputBatch(
+            max_num_reqs=args.num_req,
+            max_model_len=MAX_MODEL_LEN,
+            max_num_batched_tokens=args.num_req * args.num_token,
+            device=DEVICE,
+            pin_memory=False,
+            vocab_size=256000,
+            block_sizes=[16],
+        )
+        dummy_input_batch._req_ids = list(str(id) for id in range(args.num_req))
+        dummy_input_batch.spec_decode_unsupported_reqs = ()
+        dummy_input_batch.num_tokens_no_spec = [args.num_token] * args.num_req
+        dummy_input_batch.token_ids_cpu = np.random.randint(
+            0, 20, (args.num_req, args.num_token)
+        )
+
+        runner.input_batch = dummy_input_batch
+
+        sampled_token_ids = [[0]] * args.num_req
+
+        print("Starting benchmark")
+        # first run is warmup so ignore it
+        for _ in range(args.num_iteration):
+            start = time.time()
+            runner.drafter.propose(
+                sampled_token_ids,
+                dummy_input_batch.req_ids,
+                dummy_input_batch.num_tokens_no_spec,
+                dummy_input_batch.token_ids_cpu,
+                dummy_input_batch.spec_decode_unsupported_reqs,
+            )
+            end = time.time()
+            print(f"Iteration time (s): {end - start}")
+
+
 def invoke_main() -> None:
     parser = FlexibleArgumentParser(
         description="Benchmark the performance of N-gram speculative decode drafting"
     )
+    parser.add_argument(
+        "--batched", action="store_true", help="consider time to prepare batch"
+    )
     parser.add_argument(
         "--num-iteration",
         type=int,
@@ -105,8 +197,17 @@ def invoke_main() -> None:
         help="Number of speculative tokens to generate",
     )
     args = parser.parse_args()
-    main(args)
+
+    if not args.batched:
+        benchmark_propose(args)
+    else:
+        benchmark_batched_propose(args)
 
 
+"""
+# Example command lines:
+# time python3 benchmarks/benchmark_ngram_proposer.py
+# time python3 benchmarks/benchmark_ngram_proposer.py --batched --num-iteration 4 --num-token 1000000 --num-req 128
+"""  # noqa: E501
 if __name__ == "__main__":
     invoke_main()  # pragma: no cover
```

```diff
@@ -32,7 +32,6 @@ import dataclasses
 import json
 import random
 import time
-from typing import Optional
 
 from transformers import PreTrainedTokenizerBase
 
@@ -80,7 +79,7 @@ def sample_requests_from_dataset(
     num_requests: int,
     tokenizer: PreTrainedTokenizerBase,
     input_length_range: tuple[int, int],
-    fixed_output_len: Optional[int],
+    fixed_output_len: int | None,
 ) -> list[Request]:
     if fixed_output_len is not None and fixed_output_len < 4:
         raise ValueError("output_len too small")
@@ -128,7 +127,7 @@ def sample_requests_from_random(
     num_requests: int,
     tokenizer: PreTrainedTokenizerBase,
     input_length_range: tuple[int, int],
-    fixed_output_len: Optional[int],
+    fixed_output_len: int | None,
     prefix_len: int,
 ) -> list[Request]:
     requests = []
```

```diff
@@ -7,7 +7,6 @@ import dataclasses
 import json
 import random
 import time
-from typing import Optional
 
 from transformers import AutoTokenizer, PreTrainedTokenizerBase
 
@@ -24,7 +23,7 @@ def sample_requests(
     dataset_path: str,
     num_requests: int,
     tokenizer: PreTrainedTokenizerBase,
-    fixed_output_len: Optional[int],
+    fixed_output_len: int | None,
 ) -> list[tuple[str, int, int, int]]:
     if fixed_output_len is not None and fixed_output_len < 4:
         raise ValueError("output_len too small")
```

```diff
@@ -31,20 +31,19 @@ import time
 import uuid
 import warnings
 from collections.abc import AsyncGenerator
+from contextlib import nullcontext
 from dataclasses import dataclass
-from typing import Optional
 
 import datasets
 import numpy as np
 import pandas as pd
-from tqdm.asyncio import tqdm
-from transformers import PreTrainedTokenizerBase
-
 from backend_request_func import (
     ASYNC_REQUEST_FUNCS,
     RequestFuncInput,
     RequestFuncOutput,
 )
+from tqdm.asyncio import tqdm
+from transformers import PreTrainedTokenizerBase
 
 try:
     from vllm.transformers_utils.tokenizer import get_tokenizer
@@ -317,7 +316,7 @@ def calculate_metrics(
     tokenizer: PreTrainedTokenizerBase,
     selected_percentile_metrics: list[str],
     selected_percentiles: list[float],
-    goodput_config_dict: Optional[dict[str, float]] = None,
+    goodput_config_dict: dict[str, float] | None = None,
 ) -> tuple[BenchmarkMetrics, list[int]]:
     actual_output_lens: list[int] = []
     total_input = 0
@@ -437,9 +436,9 @@ async def benchmark(
     selected_percentile_metrics: list[str],
     selected_percentiles: list[str],
     ignore_eos: bool,
-    max_concurrency: Optional[int],
+    max_concurrency: int | None,
     structured_output_ratio: float,
-    goodput_config_dict: Optional[dict[str, float]] = None,
+    goodput_config_dict: dict[str, float] | None = None,
 ):
     if backend in ASYNC_REQUEST_FUNCS:
         request_func = ASYNC_REQUEST_FUNCS[backend]
@@ -449,7 +448,8 @@ async def benchmark(
     def prepare_extra_body(request) -> dict:
         extra_body = {}
         # Add the schema to the extra_body
-        extra_body[request.structure_type] = request.schema
+        extra_body["structured_outputs"] = {}
+        extra_body["structured_outputs"][request.structure_type] = request.schema
         return extra_body
 
     print("Starting initial single prompt test run...")
@@ -502,15 +502,9 @@ async def benchmark(
 
     pbar = None if disable_tqdm else tqdm(total=len(input_requests))
 
-    # This can be used once the minimum Python version is 3.10 or higher,
-    # and it will simplify the code in limited_request_func.
-    #    semaphore = (asyncio.Semaphore(max_concurrency)
-    #                 if max_concurrency else contextlib.nullcontext())
-    semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None
+    semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else nullcontext()
 
     async def limited_request_func(request_func_input, pbar):
-        if semaphore is None:
-            return await request_func(request_func_input=request_func_input, pbar=pbar)
         async with semaphore:
             return await request_func(request_func_input=request_func_input, pbar=pbar)
 
@@ -696,11 +690,11 @@ def evaluate(ret, args):
         return re.match(args.regex, actual) is not None
 
     def _eval_correctness(expected, actual):
-        if args.structure_type == "guided_json":
+        if args.structure_type == "json":
             return _eval_correctness_json(expected, actual)
-        elif args.structure_type == "guided_regex":
+        elif args.structure_type == "regex":
             return _eval_correctness_regex(expected, actual)
-        elif args.structure_type == "guided_choice":
+        elif args.structure_type == "choice":
             return _eval_correctness_choice(expected, actual)
         else:
             return None
@@ -780,18 +774,18 @@ def main(args: argparse.Namespace):
     )
 
     if args.dataset == "grammar":
-        args.structure_type = "guided_grammar"
+        args.structure_type = "grammar"
     elif args.dataset == "regex":
-        args.structure_type = "guided_regex"
+        args.structure_type = "regex"
     elif args.dataset == "choice":
-        args.structure_type = "guided_choice"
+        args.structure_type = "choice"
     else:
-        args.structure_type = "guided_json"
+        args.structure_type = "json"
 
     if args.no_structured_output:
         args.structured_output_ratio = 0
     if args.save_results:
-        result_file_name = f"{args.structured_output_ratio}guided"
+        result_file_name = f"{args.structured_output_ratio}so"
         result_file_name += f"_{backend}"
         result_file_name += f"_{args.request_rate}qps"
         result_file_name += f"_{args.model.split('/')[-1]}"
@@ -909,13 +903,13 @@ def create_argument_parser():
     parser.add_argument(
         "--tokenizer",
         type=str,
-        help="Name or path of the tokenizer, if not using the default tokenizer.",  # noqa: E501
+        help="Name or path of the tokenizer, if not using the default tokenizer.",
     )
     parser.add_argument(
         "--tokenizer-mode",
         type=str,
         default="auto",
-        help="Name or path of the tokenizer, if not using the default tokenizer.",  # noqa: E501
+        help="Name or path of the tokenizer, if not using the default tokenizer.",
    )
     parser.add_argument(
         "--num-prompts",
```

```diff
@@ -6,7 +6,7 @@ import math
 import os
 import time
 from types import TracebackType
-from typing import Any, Optional, Union
+from typing import Any
 
 
 def convert_to_pytorch_benchmark_format(
@@ -92,7 +92,7 @@ class TimeCollector:
     def __init__(self, scale: int) -> None:
         self.cnt: int = 0
         self._sum: int = 0
-        self._max: Optional[int] = None
+        self._max: int | None = None
         self.scale = scale
         self.start_time: int = time.monotonic_ns()
 
@@ -104,13 +104,13 @@ class TimeCollector:
         else:
             self._max = max(self._max, v)
 
-    def avg(self) -> Union[float, str]:
+    def avg(self) -> float | str:
         return self._sum * 1.0 / self.cnt / self.scale if self.cnt > 0 else "N/A"
 
-    def max(self) -> Union[float, str]:
+    def max(self) -> float | str:
         return self._max / self.scale if self._max else "N/A"
 
-    def dump_avg_max(self) -> list[Union[float, str]]:
+    def dump_avg_max(self) -> list[float | str]:
         return [self.avg(), self.max()]
 
     def __enter__(self) -> None:
@@ -118,8 +118,8 @@ class TimeCollector:
 
     def __exit__(
         self,
-        exc_type: Optional[type[BaseException]],
-        exc_value: Optional[BaseException],
-        exc_traceback: Optional[TracebackType],
+        exc_type: type[BaseException] | None,
+        exc_value: BaseException | None,
+        exc_traceback: TracebackType | None,
     ) -> None:
         self.collect(time.monotonic_ns() - self.start_time)
```

```diff
@@ -6,8 +6,7 @@ import copy
 import itertools
 import pickle as pkl
 import time
-from collections.abc import Iterable
-from typing import Callable
+from collections.abc import Callable, Iterable
 
 import torch
 import torch.utils.benchmark as TBenchmark
```

```diff
@@ -6,8 +6,7 @@ import copy
 import itertools
 import pickle as pkl
 import time
-from collections.abc import Iterable
-from typing import Callable, Optional
+from collections.abc import Callable, Iterable
 
 import torch
 import torch.utils.benchmark as TBenchmark
@@ -17,7 +16,7 @@ from weight_shapes import WEIGHT_SHAPES
 
 from vllm import _custom_ops as ops
 from vllm.model_executor.layers.quantization.utils.fp8_utils import (
-    w8a8_block_fp8_matmul,
+    w8a8_triton_block_scaled_mm,
 )
 from vllm.utils import FlexibleArgumentParser, cdiv
 
@@ -53,7 +52,7 @@ def bench_int8(
     n: int,
     label: str,
     sub_label: str,
-    bench_kernels: Optional[list[str]] = None,
+    bench_kernels: list[str] | None = None,
 ) -> Iterable[TMeasurement]:
     """Benchmark INT8-based kernels."""
     assert dtype == torch.int8
@@ -108,7 +107,7 @@ def bench_fp8(
     n: int,
     label: str,
     sub_label: str,
-    bench_kernels: Optional[list[str]] = None,
+    bench_kernels: list[str] | None = None,
 ) -> Iterable[TMeasurement]:
     """Benchmark FP8-based kernels."""
     assert dtype == torch.float8_e4m3fn
@@ -158,7 +157,7 @@ def bench_fp8(
         "cutlass_fp8_fp8_fp16_scaled_mm_bias": lambda: ops.cutlass_scaled_mm(
             a, b, scale_a, scale_b, torch.float16, bias.to(dtype=torch.float16)
         ),
-        "triton_fp8_fp8_fp16_scaled_mm_blockwise": lambda: w8a8_block_fp8_matmul(
+        "triton_fp8_fp8_fp16_scaled_mm_blockwise": lambda: w8a8_triton_block_scaled_mm(
             a_cont, b.t(), block_scale_a, block_scale_b.t(), (128, 128)
         ),
         "cutlass_fp8_fp8_fp16_scaled_mm_blockwise": lambda: ops.cutlass_scaled_mm(
@@ -183,7 +182,7 @@ def bench(
     n: int,
     label: str,
     sub_label: str,
-    bench_kernels: Optional[list[str]] = None,
+    bench_kernels: list[str] | None = None,
 ) -> Iterable[TMeasurement]:
     if dtype == torch.int8:
         return bench_int8(dtype, m, k, n, label, sub_label, bench_kernels)
@@ -201,7 +200,7 @@ def print_timers(timers: Iterable[TMeasurement]):
 def run(
     dtype: torch.dtype,
     MKNs: Iterable[tuple[int, int, int]],
-    bench_kernels: Optional[list[str]] = None,
+    bench_kernels: list[str] | None = None,
 ) -> Iterable[TMeasurement]:
     results = []
     for m, k, n in MKNs:
```

```diff
@@ -55,9 +55,7 @@ benchmark() {
   output_len=$2
 
 
-  CUDA_VISIBLE_DEVICES=0 python3 \
-    -m vllm.entrypoints.openai.api_server \
-    --model $model \
+  CUDA_VISIBLE_DEVICES=0 vllm serve $model \
     --port 8100 \
     --max-model-len 10000 \
     --gpu-memory-utilization 0.6 \
@@ -65,9 +63,7 @@ benchmark() {
     '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
 
 
-  CUDA_VISIBLE_DEVICES=1 python3 \
-    -m vllm.entrypoints.openai.api_server \
-    --model $model \
+  CUDA_VISIBLE_DEVICES=1 vllm serve $model \
     --port 8200 \
     --max-model-len 10000 \
     --gpu-memory-utilization 0.6 \
```
@ -38,16 +38,12 @@ wait_for_server() {
 | 
				
			|||||||
launch_chunked_prefill() {
 | 
					launch_chunked_prefill() {
 | 
				
			||||||
  model="meta-llama/Meta-Llama-3.1-8B-Instruct"
 | 
					  model="meta-llama/Meta-Llama-3.1-8B-Instruct"
 | 
				
			||||||
  # disagg prefill
 | 
					  # disagg prefill
 | 
				
			||||||
  CUDA_VISIBLE_DEVICES=0 python3 \
 | 
					  CUDA_VISIBLE_DEVICES=0 vllm serve $model \
 | 
				
			||||||
    -m vllm.entrypoints.openai.api_server \
 | 
					 | 
				
			||||||
    --model $model \
 | 
					 | 
				
			||||||
    --port 8100 \
 | 
					    --port 8100 \
 | 
				
			||||||
    --max-model-len 10000 \
 | 
					    --max-model-len 10000 \
 | 
				
			||||||
    --enable-chunked-prefill \
 | 
					    --enable-chunked-prefill \
 | 
				
			||||||
    --gpu-memory-utilization 0.6 &
 | 
					    --gpu-memory-utilization 0.6 &
 | 
				
			||||||
  CUDA_VISIBLE_DEVICES=1 python3 \
 | 
					  CUDA_VISIBLE_DEVICES=1 vllm serve $model \
 | 
				
			||||||
    -m vllm.entrypoints.openai.api_server \
 | 
					 | 
				
			||||||
    --model $model \
 | 
					 | 
				
			||||||
    --port 8200 \
 | 
					    --port 8200 \
 | 
				
			||||||
    --max-model-len 10000 \
 | 
					    --max-model-len 10000 \
 | 
				
			||||||
    --enable-chunked-prefill \
 | 
					    --enable-chunked-prefill \
 | 
				
			||||||
@ -62,18 +58,14 @@ launch_chunked_prefill() {
 | 
				
			|||||||
launch_disagg_prefill() {
 | 
					launch_disagg_prefill() {
 | 
				
			||||||
  model="meta-llama/Meta-Llama-3.1-8B-Instruct"
 | 
					  model="meta-llama/Meta-Llama-3.1-8B-Instruct"
 | 
				
			||||||
  # disagg prefill
 | 
					  # disagg prefill
 | 
				
			||||||
  CUDA_VISIBLE_DEVICES=0 python3 \
 | 
					  CUDA_VISIBLE_DEVICES=0 vllm serve $model \
 | 
				
			||||||
    -m vllm.entrypoints.openai.api_server \
 | 
					 | 
				
			||||||
    --model $model \
 | 
					 | 
				
			||||||
    --port 8100 \
 | 
					    --port 8100 \
 | 
				
			||||||
    --max-model-len 10000 \
 | 
					    --max-model-len 10000 \
 | 
				
			||||||
    --gpu-memory-utilization 0.6 \
 | 
					    --gpu-memory-utilization 0.6 \
 | 
				
			||||||
    --kv-transfer-config \
 | 
					    --kv-transfer-config \
 | 
				
			||||||
    '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
 | 
					    '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' &
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  CUDA_VISIBLE_DEVICES=1 python3 \
 | 
					  CUDA_VISIBLE_DEVICES=1 vllm serve $model \
 | 
				
			||||||
    -m vllm.entrypoints.openai.api_server \
 | 
					 | 
				
			||||||
    --model $model \
 | 
					 | 
				
			||||||
    --port 8200 \
 | 
					    --port 8200 \
 | 
				
			||||||
    --max-model-len 10000 \
 | 
					    --max-model-len 10000 \
 | 
				
			||||||
    --gpu-memory-utilization 0.6 \
 | 
					    --gpu-memory-utilization 0.6 \
 | 
				
			||||||
 | 
				
@@ -3,10 +3,9 @@

 import pickle as pkl
 import time
-from collections.abc import Iterable
+from collections.abc import Callable, Iterable
 from dataclasses import dataclass
 from itertools import product
-from typing import Callable, Optional

 import torch
 import torch.utils.benchmark as TBenchmark
@@ -51,7 +50,7 @@ def get_bench_params() -> list[bench_params_t]:
 def unfused_int8_impl(
     rms_norm_layer: RMSNorm,
     x: torch.Tensor,
-    residual: Optional[torch.Tensor],
+    residual: torch.Tensor | None,
     quant_dtype: torch.dtype,
 ):
     # Norm
@@ -68,7 +67,7 @@ def unfused_int8_impl(
 def unfused_fp8_impl(
     rms_norm_layer: RMSNorm,
     x: torch.Tensor,
-    residual: Optional[torch.Tensor],
+    residual: torch.Tensor | None,
     quant_dtype: torch.dtype,
 ):
     # Norm
@@ -85,7 +84,7 @@ def unfused_fp8_impl(
 def fused_impl(
     rms_norm_layer: RMSNorm,  # this stores the weights
     x: torch.Tensor,
-    residual: Optional[torch.Tensor],
+    residual: torch.Tensor | None,
     quant_dtype: torch.dtype,
 ):
     out, _ = ops.rms_norm_dynamic_per_token_quant(
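The hunks above all make the same typing cleanup: Callable now comes from collections.abc and Optional[X] becomes the PEP 604 union X | None. A minimal, self-contained sketch of the style (illustrative only, not part of the diff):

# Illustrative sketch of the modernized typing style; not part of the diff.
from collections.abc import Callable


def apply_maybe(fn: Callable[[int], int], x: int | None) -> int | None:
    # "int | None" is the PEP 604 spelling of Optional[int].
    return None if x is None else fn(x)


assert apply_maybe(lambda v: v + 1, 3) == 4
assert apply_maybe(lambda v: v + 1, None) is None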
benchmarks/kernels/bench_mxfp4_qutlass.py  (new file, 191 lines)
@@ -0,0 +1,191 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+#
+# Copyright (C) 2025 Roberto L. Castro (Roberto.LopezCastro@ist.ac.at).
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import argparse
+import copy
+import itertools
+
+import torch
+from compressed_tensors.transform.utils.hadamard import deterministic_hadamard_matrix
+from weight_shapes import WEIGHT_SHAPES
+
+from vllm._custom_ops import fusedQuantizeMx, matmul_mxf4_bf16_tn
+from vllm.model_executor.layers.quantization.qutlass_utils import to_blocked
+from vllm.triton_utils import triton
+
+PROVIDER_CFGS = {
+    "torch-bf16": dict(enabled=True),
+    "mxfp4": dict(no_a_quant=False, enabled=True),
+    "mxfp4-noquant": dict(no_a_quant=True, enabled=True),
+}
+
+_enabled = [k for k, v in PROVIDER_CFGS.items() if v["enabled"]]
+
+
+def get_hadamard_matrix(group_size: int, dtype: torch.dtype, device: torch.device):
+    return (
+        deterministic_hadamard_matrix(group_size, dtype=dtype, device=device)
+        * group_size**-0.5
+    )
+
+
+def _quant_weight_mxfp4(
+    b: torch.Tensor, forward_hadamard_matrix: torch.Tensor, device: str
+):
+    weight_hf_e2m1, weight_hf_e8m0 = fusedQuantizeMx(
+        b, forward_hadamard_matrix, method="abs_max"
+    )
+    weight_hf_scale_block = to_blocked(weight_hf_e8m0, backend="triton")
+    return weight_hf_e2m1, weight_hf_scale_block
+
+
+def build_mxfp4_runner(cfg, a, b, forward_hadamard_matrix, dtype, device):
+    weight_hf_e2m1, weight_hf_scale_block = _quant_weight_mxfp4(
+        b, forward_hadamard_matrix, device
+    )
+    alpha = torch.tensor([1.0], device="cuda")
+
+    if cfg["no_a_quant"]:
+        # Pre-quantize activation
+        input_hf_e2m1, input_hf_e8m0 = fusedQuantizeMx(
+            a, forward_hadamard_matrix, method="abs_max"
+        )
+        input_hf_scale_block = to_blocked(input_hf_e8m0, backend="triton")
+
+        def run():
+            return matmul_mxf4_bf16_tn(
+                input_hf_e2m1,
+                weight_hf_e2m1,
+                input_hf_scale_block,
+                weight_hf_scale_block,
+                alpha,
+            )
+
+        return run
+
+    # Quantize activation on-the-fly
+    def run():
+        input_hf_e2m1, input_hf_e8m0 = fusedQuantizeMx(
+            a, forward_hadamard_matrix, method="abs_max"
+        )
+        input_hf_scale_block = to_blocked(input_hf_e8m0, backend="triton")
+        return matmul_mxf4_bf16_tn(
+            input_hf_e2m1,
+            weight_hf_e2m1,
+            input_hf_scale_block,
+            weight_hf_scale_block,
+            alpha,
+        )
+
+    return run
+
+
+@triton.testing.perf_report(
+    triton.testing.Benchmark(
+        x_names=["batch_size"],
+        x_vals=[
+            1,
+            4,
+            8,
+            16,
+            32,
+            64,
+            128,
+            256,
+            512,
+            1024,
+            2048,
+            4096,
+            8192,
+            16384,
+            24576,
+            32768,
+        ],
+        x_log=False,
+        line_arg="provider",
+        line_vals=_enabled,
+        line_names=_enabled,
+        ylabel="TFLOP/s (larger is better)",
+        plot_name="BF16 vs MXFP4 GEMMs",
+        args={},
+    )
+)
+def benchmark(batch_size, provider, N, K, had_size):
+    M = batch_size
+    device = "cuda"
+    dtype = torch.bfloat16
+
+    a = torch.randn((M, K), device=device, dtype=dtype)
+    b = torch.randn((N, K), device=device, dtype=dtype)
+    forward_hadamard_matrix = get_hadamard_matrix(had_size, dtype, device)
+
+    quantiles = [0.5, 0.2, 0.8]
+
+    if provider == "torch-bf16":
+        ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
+            lambda: torch.nn.functional.linear(a, b), rep=200, quantiles=quantiles
+        )
+    else:
+        cfg = PROVIDER_CFGS[provider]
+        run_quant = build_mxfp4_runner(
+            cfg, a, b, forward_hadamard_matrix, dtype, device
+        )
+        ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
+            lambda: run_quant(), rep=200, quantiles=quantiles
+        )
+
+    to_tflops = lambda t_ms: (2 * M * N * K) * 1e-12 / (t_ms * 1e-3)
+    return to_tflops(ms), to_tflops(max_ms), to_tflops(min_ms)
+
+
+def prepare_shapes(args):
+    out = []
+    for model, tp_size in itertools.product(args.models, args.tp_sizes):
+        for KN, tp_dim in copy.deepcopy(WEIGHT_SHAPES[model]):
+            KN[tp_dim] //= tp_size
+            KN.append(model)
+            out.append(KN)
+    return out
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--models",
+        nargs="+",
+        type=str,
+        default=["meta-llama/Llama-3.3-70B-Instruct"],
+        choices=list(WEIGHT_SHAPES.keys()),
+    )
+    parser.add_argument("--tp-sizes", nargs="+", type=int, default=[1])
+    args = parser.parse_args()
+
+    for K, N, model in prepare_shapes(args):
+        for had_size in [32, 64, 128]:
+            print(f"{model}, N={N} K={K}, HAD={had_size}, BF16 vs MXFP4 GEMMs TFLOP/s:")
+            benchmark.run(
+                print_data=True,
+                show_plots=True,
+                save_path=f"bench_mxfp4_res_n{N}_k{K}",
+                N=N,
+                K=K,
+                had_size=had_size,
+            )
+
+    print("Benchmark finished!")
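As a reading aid for the new file above, here is a minimal sketch of what one "mxfp4" provider measurement boils down to for a single (M, N, K) shape. It reuses only helpers imported by the script itself and assumes a CUDA device with the qutlass kernels available in this vLLM build:

# Minimal sketch of a single MXFP4 measurement, mirroring benchmark() above.
# Assumes CUDA plus the qutlass kernels shipped with this vLLM branch.
import torch
from compressed_tensors.transform.utils.hadamard import deterministic_hadamard_matrix

from vllm._custom_ops import fusedQuantizeMx, matmul_mxf4_bf16_tn
from vllm.model_executor.layers.quantization.qutlass_utils import to_blocked

M, N, K, had_size = 16, 4096, 4096, 32
a = torch.randn((M, K), device="cuda", dtype=torch.bfloat16)
b = torch.randn((N, K), device="cuda", dtype=torch.bfloat16)
had = deterministic_hadamard_matrix(
    had_size, dtype=torch.bfloat16, device="cuda"
) * had_size**-0.5

# The weight is quantized once up front; the two mxfp4 providers differ only in
# whether the activation is pre-quantized or quantized inside the timed region.
w_e2m1, w_e8m0 = fusedQuantizeMx(b, had, method="abs_max")
w_scale = to_blocked(w_e8m0, backend="triton")
a_e2m1, a_e8m0 = fusedQuantizeMx(a, had, method="abs_max")
a_scale = to_blocked(a_e8m0, backend="triton")

out = matmul_mxf4_bf16_tn(
    a_e2m1, w_e2m1, a_scale, w_scale, torch.tensor([1.0], device="cuda")
)
print(out.shape)  # expected: (M, N) in bfloat16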
@@ -3,6 +3,7 @@
 import argparse
 import copy
 import itertools
+import os

 import torch
 from weight_shapes import WEIGHT_SHAPES
@@ -23,21 +24,45 @@ PROVIDER_CFGS = {
     "torch-bf16": dict(enabled=True),
     "nvfp4": dict(no_a_quant=False, enabled=True),
     "nvfp4-noquant": dict(no_a_quant=True, enabled=True),
+    "fbgemm-nvfp4": dict(fbgemm=True, no_a_quant=False, enabled=True),
+    "fbgemm-nvfp4-noquant": dict(fbgemm=True, no_a_quant=True, enabled=True),
 }

+_needs_fbgemm = any(
+    v.get("fbgemm", False) for v in PROVIDER_CFGS.values() if v.get("enabled", False)
+)
+if _needs_fbgemm:
+    try:
+        from fbgemm_gpu.experimental.gemm.triton_gemm.fp4_quantize import (
+            triton_scale_nvfp4_quant,
+        )
+    except ImportError:
+        print(
+            "WARNING: FBGEMM providers are enabled but fbgemm_gpu is not installed. "
+            "These providers will be skipped. Please install fbgemm_gpu with: "
+            "'pip install fbgemm-gpu-genai' to run them."
+        )
+        # Disable FBGEMM providers so the benchmark can run.
+        for cfg in PROVIDER_CFGS.values():
+            if cfg.get("fbgemm"):
+                cfg["enabled"] = False
+
 _enabled = [k for k, v in PROVIDER_CFGS.items() if v["enabled"]]


-def _quant_weight_nvfp4(b: torch.Tensor, device: str):
+def _quant_weight_nvfp4(b: torch.Tensor, device: str, cfg):
     # Compute global scale for weight
     b_amax = torch.abs(b).max().to(torch.float32)
     b_global_scale = FLOAT8_E4M3_MAX * FLOAT4_E2M1_MAX / b_amax
-    b_fp4, scale_b_fp4 = ops.scaled_fp4_quant(b, b_global_scale)
+    if "fbgemm" in cfg and cfg["fbgemm"]:
+        b_fp4, scale_b_fp4 = triton_scale_nvfp4_quant(b, b_global_scale)
+    else:
+        b_fp4, scale_b_fp4 = ops.scaled_fp4_quant(b, b_global_scale)
     return b_fp4, scale_b_fp4, b_global_scale


 def build_nvfp4_runner(cfg, a, b, dtype, device):
-    b_fp4, scale_b_fp4, b_global_scale = _quant_weight_nvfp4(b, device)
+    b_fp4, scale_b_fp4, b_global_scale = _quant_weight_nvfp4(b, device, cfg)

     # Compute global scale for activation
     # NOTE: This is generally provided ahead-of-time by the model checkpoint.
@@ -46,6 +71,35 @@ def build_nvfp4_runner(cfg, a, b, dtype, device):

     # Alpha for the GEMM operation
     alpha = 1.0 / (a_global_scale * b_global_scale)
+    if "fbgemm" in cfg and cfg["fbgemm"]:
+        if cfg["no_a_quant"]:
+            a_fp4, scale_a_fp4 = triton_scale_nvfp4_quant(a, a_global_scale)
+
+            def run():
+                return torch.ops.fbgemm.f4f4bf16(
+                    a_fp4,
+                    b_fp4,
+                    scale_a_fp4,
+                    scale_b_fp4,
+                    global_scale=alpha,
+                    use_mx=False,
+                )
+
+            return run
+        else:
+
+            def run():
+                a_fp4, scale_a_fp4 = triton_scale_nvfp4_quant(a, a_global_scale)
+                return torch.ops.fbgemm.f4f4bf16(
+                    a_fp4,
+                    b_fp4,
+                    scale_a_fp4,
+                    scale_b_fp4,
+                    global_scale=alpha,
+                    use_mx=False,
+                )
+
+            return run
+
     if cfg["no_a_quant"]:
         # Pre-quantize activation
@@ -130,10 +184,13 @@ if __name__ == "__main__":

     for K, N, model in prepare_shapes(args):
         print(f"{model}, N={N} K={K}, BF16 vs NVFP4 GEMMs TFLOP/s:")
+        save_dir = f"bench_nvfp4_res_n{N}_k{K}"
+        os.makedirs(save_dir, exist_ok=True)
+
         benchmark.run(
             print_data=True,
             show_plots=True,
-            save_path=f"bench_nvfp4_res_n{N}_k{K}",
+            save_path=save_dir,
             N=N,
             K=K,
         )
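A short note on the guard added above: if fbgemm_gpu cannot be imported, the two fbgemm-nvfp4* providers are switched off before _enabled is computed, so the remaining providers still run. The same pattern in isolation (illustrative names, not from the diff):

# Illustrative sketch of the optional-dependency guard; names are hypothetical.
PROVIDERS = {
    "baseline": {"enabled": True},
    "fbgemm-variant": {"fbgemm": True, "enabled": True},
}

try:
    import fbgemm_gpu  # noqa: F401  (optional dependency)
except ImportError:
    # Disable only the providers that need the missing package.
    for cfg in PROVIDERS.values():
        if cfg.get("fbgemm"):
            cfg["enabled"] = False

enabled = [name for name, cfg in PROVIDERS.items() if cfg["enabled"]]
print(enabled)  # ["baseline"] when fbgemm_gpu is not installed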
benchmarks/kernels/bench_nvfp4_qutlass.py  (new file, 207 lines)
@@ -0,0 +1,207 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+#
+# Copyright (C) 2025 Roberto L. Castro (Roberto.LopezCastro@ist.ac.at).
+# All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import argparse
+import copy
+import itertools
+
+import torch
+from compressed_tensors.transform.utils.hadamard import deterministic_hadamard_matrix
+from weight_shapes import WEIGHT_SHAPES
+
+from vllm import _custom_ops as ops  # use existing nvfp4 gemm in vllm
+from vllm._custom_ops import fusedQuantizeNv
+from vllm.model_executor.layers.quantization.qutlass_utils import to_blocked
+from vllm.triton_utils import triton
+
+PROVIDER_CFGS = {
+    "torch-bf16": dict(enabled=True),
+    "nvfp4": dict(no_a_quant=False, enabled=True),
+    "nvfp4-noquant": dict(no_a_quant=True, enabled=True),
+}
+
+_enabled = [k for k, v in PROVIDER_CFGS.items() if v["enabled"]]
+
+
+def get_hadamard_matrix(group_size: int, dtype: torch.dtype, device: torch.device):
+    return (
+        deterministic_hadamard_matrix(group_size, dtype=dtype, device=device)
+        * group_size**-0.5
+    )
+
+
+def _quant_weight_nvfp4(
+    b: torch.Tensor,
+    forward_hadamard_matrix: torch.Tensor,
+    global_scale: torch.Tensor,
+    device: str,
+    M: int,
+    N: int,
+    K: int,
+):
+    weight_hf_e2m1, weight_hf_e8m0 = fusedQuantizeNv(
+        b, forward_hadamard_matrix, global_scale
+    )
+    weight_hf_scale_block = to_blocked(weight_hf_e8m0, backend="triton").view(
+        -1, K // 16
+    )
+    return weight_hf_e2m1, weight_hf_scale_block
+
+
+def build_nvfp4_runner(cfg, a, b, forward_hadamard_matrix, dtype, device, M, N, K):
+    alpha = torch.tensor([1.0], device="cuda")
+    global_scale = torch.tensor([1.0], device="cuda")
+    weight_hf_e2m1, weight_hf_scale_block = _quant_weight_nvfp4(
+        b, forward_hadamard_matrix, global_scale, device, M, N, K
+    )
+
+    if cfg["no_a_quant"]:
+        # Pre-quantize activation
+        input_hf_e2m1, input_hf_e8m0 = fusedQuantizeNv(
+            a, forward_hadamard_matrix, global_scale
+        )
+        input_hf_scale_block = to_blocked(input_hf_e8m0, backend="triton").view(
+            -1, K // 16
+        )
+
+        def run():
+            return ops.cutlass_scaled_fp4_mm(
+                input_hf_e2m1,
+                weight_hf_e2m1,
+                input_hf_scale_block,
+                weight_hf_scale_block,
+                alpha,
+                torch.bfloat16,
+            )
+
+        return run
+
+    # Quantize activation on-the-fly
+    def run():
+        input_hf_e2m1, input_hf_e8m0 = fusedQuantizeNv(
+            a, forward_hadamard_matrix, global_scale
+        )
+        input_hf_scale_block = to_blocked(input_hf_e8m0, backend="triton").view(
+            -1, K // 16
+        )
+        return ops.cutlass_scaled_fp4_mm(
+            input_hf_e2m1,
+            weight_hf_e2m1,
+            input_hf_scale_block,
+            weight_hf_scale_block,
+            alpha,
+            torch.bfloat16,
+        )
+
+    return run
+
+
+@triton.testing.perf_report(
+    triton.testing.Benchmark(
+        x_names=["batch_size"],
+        x_vals=[
+            1,
+            4,
+            8,
+            16,
+            32,
+            64,
+            128,
+            256,
+            512,
+            1024,
+            2048,
+            4096,
+            8192,
+            16384,
+            24576,
+            32768,
+        ],
+        x_log=False,
+        line_arg="provider",
+        line_vals=_enabled,
+        line_names=_enabled,
+        ylabel="TFLOP/s (larger is better)",
+        plot_name="BF16 vs NVFP4 GEMMs",
+        args={},
+    )
+)
+def benchmark(batch_size, provider, N, K, had_size):
+    M = batch_size
+    device = "cuda"
+    dtype = torch.bfloat16
+
+    a = torch.randn((M, K), device=device, dtype=dtype)
+    b = torch.randn((N, K), device=device, dtype=dtype)
+    forward_hadamard_matrix = get_hadamard_matrix(had_size, dtype, device)
+
+    quantiles = [0.5, 0.2, 0.8]
+
+    if provider == "torch-bf16":
+        ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
+            lambda: torch.nn.functional.linear(a, b), rep=200, quantiles=quantiles
+        )
+    else:
+        cfg = PROVIDER_CFGS[provider]
+        run_quant = build_nvfp4_runner(
+            cfg, a, b, forward_hadamard_matrix, dtype, device, M, N, K
+        )
+        ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
+            lambda: run_quant(), rep=200, quantiles=quantiles
+        )
+
+    to_tflops = lambda t_ms: (2 * M * N * K) * 1e-12 / (t_ms * 1e-3)
+    return to_tflops(ms), to_tflops(max_ms), to_tflops(min_ms)
+
+
+def prepare_shapes(args):
+    out = []
+    for model, tp_size in itertools.product(args.models, args.tp_sizes):
+        for KN, tp_dim in copy.deepcopy(WEIGHT_SHAPES[model]):
+            KN[tp_dim] //= tp_size
+            KN.append(model)
+            out.append(KN)
+    return out
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--models",
+        nargs="+",
+        type=str,
+        default=["meta-llama/Llama-3.3-70B-Instruct"],
+        choices=list(WEIGHT_SHAPES.keys()),
+    )
+    parser.add_argument("--tp-sizes", nargs="+", type=int, default=[1])
+    args = parser.parse_args()
+
+    for K, N, model in prepare_shapes(args):
+        for had_size in [16, 32, 64, 128]:
+            print(f"{model}, N={N} K={K}, HAD={had_size}, BF16 vs NVFP4 GEMMs TFLOP/s:")
+            benchmark.run(
+                print_data=True,
+                show_plots=True,
+                save_path=f"bench_nvfp4_res_n{N}_k{K}",
+                N=N,
+                K=K,
+                had_size=had_size,
+            )
+
+    print("Benchmark finished!")
@@ -1,15 +1,27 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import itertools
-from typing import Callable
+from collections.abc import Callable
+from unittest.mock import patch

+import pandas as pd
 import torch

-from vllm import _custom_ops as ops
-from vllm.config import CompilationConfig, VllmConfig, set_current_vllm_config
 from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8
 from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape
 from vllm.triton_utils import triton
+from vllm.utils import FlexibleArgumentParser
+from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
+
+
+def with_triton_mode(fn):
+    """Temporarily force the Triton fallback path"""
+
+    def wrapped(*args, **kwargs):
+        with patch("vllm.platforms.current_platform.is_cuda", return_value=False):
+            return fn(*args, **kwargs)
+
+    return wrapped


 # TODO(luka): use standalone_compile utility
@@ -21,78 +33,238 @@ def with_dyn_arg(fn: Callable, arg_index: int, dim_index: int):
     return inner


-torch._dynamo.config.recompile_limit = 8888
-compilation_config = CompilationConfig(custom_ops=["none"])
-with set_current_vllm_config(VllmConfig(compilation_config=compilation_config)):
-    torch_per_token_quant_fp8 = torch.compile(
-        QuantFP8(False, GroupShape.PER_TOKEN),
-        fullgraph=True,
-        dynamic=False,  # recompile for different shapes
-    )
+def bench_compile(fn: Callable):
+    # recompile for different shapes
+    fwd = torch.compile(fn, fullgraph=True, dynamic=False)

     # First dim is explicitly dynamic to simulate vLLM usage
-    torch_per_token_quant_fp8 = with_dyn_arg(torch_per_token_quant_fp8, 0, 0)
+    return with_dyn_arg(fwd, 0, 0)


-def cuda_per_token_quant_fp8(
-    input: torch.Tensor,
-) -> tuple[torch.Tensor, torch.Tensor]:
-    return ops.scaled_fp8_quant(input)
+torch._dynamo.config.recompile_limit = 8888


-def calculate_diff(batch_size: int, seq_len: int):
-    """Calculate difference between Triton and CUDA implementations."""
+def calculate_diff(
+    batch_size: int,
+    hidden_size: int,
+    group_shape: GroupShape,
+    dtype: torch.dtype,
+):
+    """Calculate the difference between Inductor and CUDA implementations."""
     device = torch.device("cuda")
-    x = torch.rand((batch_size * seq_len, 4096), dtype=torch.float16, device=device)
+    x = torch.randn((batch_size, hidden_size), dtype=dtype, device=device)

-    torch_out, torch_scale = torch_per_token_quant_fp8(x)
-    cuda_out, cuda_scale = cuda_per_token_quant_fp8(x)
+    quant_fp8 = QuantFP8(False, group_shape, column_major_scales=False)

-    if torch.allclose(
-        cuda_out.to(torch.float32), torch_out.to(torch.float32), rtol=1e-3, atol=1e-5
-    ) and torch.allclose(cuda_scale, torch_scale, rtol=1e-3, atol=1e-5):
-        print("✅ All implementations match")
-    else:
-        print("❌ Implementations differ")
-
-
-batch_size_range = [1, 16, 32, 64, 128]
-seq_len_range = [1, 16, 64, 128, 256, 512, 1024, 2048, 4096]
-
-configs = list(itertools.product(batch_size_range, seq_len_range))
-
-
-@triton.testing.perf_report(
-    triton.testing.Benchmark(
-        x_names=["batch_size", "seq_len"],
-        x_vals=configs,
-        line_arg="provider",
-        line_vals=["torch", "cuda"],
-        line_names=["Torch", "CUDA"],
-        styles=[("blue", "-"), ("green", "-")],
-        ylabel="us",
-        plot_name="per-token-dynamic-quant-fp8-performance",
-        args={},
-    )
-)
-def benchmark_quantization(batch_size, seq_len, provider):
-    dtype = torch.float16
+    torch_out, torch_scale = bench_compile(quant_fp8.forward_native)(x)
+    torch_eager_out, torch_eager_scale = quant_fp8.forward_native(x)
+    cuda_out, cuda_scale = quant_fp8.forward_cuda(x)
+
+    try:
+        torch.testing.assert_close(
+            cuda_out.to(torch.float32),
+            torch_out.to(torch.float32),
+            rtol=1e-3,
+            atol=1e-5,
+        )
+        torch.testing.assert_close(cuda_scale, torch_scale, rtol=1e-3, atol=1e-5)
+        torch.testing.assert_close(
+            cuda_out.to(torch.float32),
+            torch_eager_out.to(torch.float32),
+            rtol=1e-3,
+            atol=1e-5,
+        )
+        torch.testing.assert_close(cuda_scale, torch_eager_scale, rtol=1e-3, atol=1e-5)
+        print("✅ All implementations match")
+    except AssertionError as e:
+        print("❌ Implementations differ")
+        print(e)
+
+
+configs = []
+
+
+def benchmark_quantization(
+    batch_size,
+    hidden_size,
+    provider,
+    group_shape: GroupShape,
+    col_major: bool,
+    dtype: torch.dtype,
+):
     device = torch.device("cuda")

-    x = torch.randn(batch_size * seq_len, 4096, device=device, dtype=dtype)
+    x = torch.randn(batch_size, hidden_size, device=device, dtype=dtype)

     quantiles = [0.5, 0.2, 0.8]
+    quant_fp8 = QuantFP8(False, group_shape, column_major_scales=col_major)

     if provider == "torch":
-        fn = lambda: torch_per_token_quant_fp8(x.clone())
+        fn = lambda: bench_compile(quant_fp8.forward_native)(x.clone())
     elif provider == "cuda":
-        fn = lambda: cuda_per_token_quant_fp8(x.clone())
+        fn = lambda: quant_fp8.forward_cuda(x.clone())
+    elif provider == "triton":
+        if not group_shape.is_per_group():
+            # Triton only supported for per-group
+            return 0, 0, 0
+
+        fn = lambda: with_triton_mode(quant_fp8.forward_cuda)(x.clone())

     ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(fn, quantiles=quantiles)

     return 1000 * ms, 1000 * max_ms, 1000 * min_ms


+# TODO(luka) extract to utils
+def compute_geomean_speedups(
+    df: pd.DataFrame,
+    baseline_col: str,
+    speedup_cols: list[str],
+    groupby_cols: list[str] | None = None,
+) -> pd.DataFrame:
+    """
+    Compute geometric mean speedups over a baseline column.
+
+    Args:
+        df: Input dataframe
+        baseline_col: Column to use as baseline
+        speedup_cols: Columns to compute speedups for
+        groupby_cols: Columns to group by. If None, compute over entire df.
+
+    Returns:
+        pd.DataFrame with geometric mean speedups
+    """
+    from scipy.stats import gmean
+
+    def geo_speedup(group: pd.DataFrame) -> pd.Series:
+        ratios = {
+            col: (group[baseline_col] / group[col]).values for col in speedup_cols
+        }
+        return pd.Series({col: gmean(vals) for col, vals in ratios.items()})
+
+    if groupby_cols is None:
+        result = geo_speedup(df).to_frame().T
+    else:
+        result = (
+            df.groupby(groupby_cols)
+            .apply(geo_speedup, include_groups=False)
+            .reset_index()
+        )
+
+    return result
+
+
 if __name__ == "__main__":
-    calculate_diff(batch_size=4, seq_len=4096)
-    benchmark_quantization.run(print_data=True)
+    parser = FlexibleArgumentParser(
+        description="Benchmark the various implementations of QuantFP8 (dynamic-only)"
+    )
+    parser.add_argument("-c", "--check", action="store_true")
+    parser.add_argument(
+        "--dtype", type=str, choices=["half", "bfloat16", "float"], default="bfloat16"
+    )
+    parser.add_argument(
+        "--hidden-sizes",
+        type=int,
+        nargs="+",
+        default=[896, 1024, 2048, 4096, 7168],
+        help="Hidden sizes to benchmark",
+    )
+    parser.add_argument(
+        "--batch-sizes",
+        type=int,
+        nargs="+",
+        default=[1, 16, 128, 512, 1024],
+        help="Batch sizes to benchmark",
+    )
+    parser.add_argument(
+        "--group-sizes",
+        type=int,
+        nargs="+",
+        default=None,
+        help="Group sizes for GroupShape(1,N) to benchmark. "
+        "Use 0 for PER_TENSOR, -1 for PER_TOKEN (default: 0,-1,64,128)",
+    )
+    parser.add_argument(
+        "--no-column-major",
+        action="store_true",
+        help="Disable column-major scales testing",
+    )
+
+    args = parser.parse_args()
+    assert args
+
+    dtype = STR_DTYPE_TO_TORCH_DTYPE[args.dtype]
+
+    hidden_sizes = args.hidden_sizes
+    batch_sizes = args.batch_sizes
+
+    if args.group_sizes is not None:
+        group_shapes = []
+        for size in args.group_sizes:
+            if size == 0:
+                group_shapes.append(GroupShape.PER_TENSOR)
+            elif size == -1:
+                group_shapes.append(GroupShape.PER_TOKEN)
+            else:
+                group_shapes.append(GroupShape(1, size))
+    else:
+        group_shapes = [
+            GroupShape.PER_TENSOR,
+            GroupShape.PER_TOKEN,
+            GroupShape(1, 64),
+            GroupShape(1, 128),
+        ]
+
+    column_major_scales = [False] if args.no_column_major else [True, False]
+
+    config_gen = itertools.product(
+        group_shapes,
+        column_major_scales,
+        batch_sizes,
+        hidden_sizes,
+    )
+
+    # filter out column-major scales for non-group, reverse order
+    configs.extend(c[::-1] for c in config_gen if (c[0].is_per_group() or not c[1]))
+
+    print(f"Running {len(configs)} configurations:")
+    print(f"  Hidden sizes: {hidden_sizes}")
+    print(f"  Batch sizes: {batch_sizes}")
+    print(f"  Group shapes: {[str(g) for g in group_shapes]}")
+    print(f"  Column major scales: {column_major_scales}")
+    print()
+
+    if args.check:
+        for group_shape in group_shapes:
+            group_size = group_shape[1]
+            print(f"{group_size=}")
+            calculate_diff(
+                batch_size=4, hidden_size=4096, group_shape=group_shape, dtype=dtype
+            )
+
+    benchmark = triton.testing.perf_report(
+        triton.testing.Benchmark(
+            x_names=["hidden_size", "batch_size", "col_major", "group_shape"],
+            x_vals=configs,
+            line_arg="provider",
+            line_vals=["torch", "cuda", "triton"],
+            line_names=["Torch (Compiled)", "CUDA", "Triton"],
+            styles=[("blue", "-"), ("green", "-"), ("black", "-")],
+            ylabel="us",
+            plot_name="QuantFP8 performance",
+            args={},
+        )
+    )(benchmark_quantization)
+
+    df = benchmark.run(print_data=True, dtype=dtype, return_df=True)
+
+    # Print geomean speedups
+    geo_table_grouped = compute_geomean_speedups(
+        df,
+        baseline_col="Torch (Compiled)",
+        speedup_cols=["CUDA", "Triton"],
+        groupby_cols=["col_major", "group_shape"],
+    )
+
+    print("Speedup over Torch (Compiled)")
+    print(geo_table_grouped.to_string(index=False))
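For reference, the --group-sizes handling added above maps 0 to per-tensor scaling, -1 to per-token scaling, and any positive N to a (1, N) group shape. A condensed sketch of that mapping, using vLLM's GroupShape as in the diff:

# Condensed sketch of the --group-sizes mapping in the new __main__ block.
from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape


def to_group_shape(size: int) -> GroupShape:
    if size == 0:
        return GroupShape.PER_TENSOR
    if size == -1:
        return GroupShape.PER_TOKEN
    return GroupShape(1, size)  # per-group scaling along the hidden dimension


print([to_group_shape(s) for s in (0, -1, 64, 128)])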
@@ -10,7 +10,8 @@ import vllm.model_executor.layers.activation  # noqa F401
 from vllm.model_executor.custom_op import CustomOp
 from vllm.platforms import current_platform
 from vllm.triton_utils import triton
-from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
+from vllm.utils import FlexibleArgumentParser
+from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE

 batch_size_range = [1, 16, 32, 64, 128]
 seq_len_range = [1, 16, 64, 128, 256, 512, 1024, 2048, 4096]
@@ -13,6 +13,10 @@ import torch.utils.benchmark as benchmark
 
 from vllm import _custom_ops as ops
 from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
+from vllm.model_executor.layers.fused_moe.config import (
+    fp8_w8a8_moe_quant_config,
+    nvfp4_moe_quant_config,
+)
 from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp4
 from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk
 from vllm.scalar_type import scalar_types
@@ -140,6 +144,12 @@ def bench_run(
         a_fp8_scale: torch.Tensor,
         num_repeats: int,
     ):
+        quant_config = fp8_w8a8_moe_quant_config(
+            w1_scale=w1_scale,
+            w2_scale=w2_scale,
+            a1_scale=a_fp8_scale,
+        )
+
         for _ in range(num_repeats):
             fused_experts(
                 a,
@@ -147,10 +157,7 @@ def bench_run(
                 w2,
                 topk_weights,
                 topk_ids,
-                use_fp8_w8a8=True,
-                w1_scale=w1_scale,
-                w2_scale=w2_scale,
-                a1_scale=a_fp8_scale,
+                quant_config=quant_config,
             )
 
     def run_cutlass_moe_fp4(
@@ -172,25 +179,27 @@ def bench_run(
         device: torch.device,
         num_repeats: int,
     ):
+        quant_config = nvfp4_moe_quant_config(
+            a1_gscale=a1_gs,
+            a2_gscale=a2_gs,
+            w1_scale=w1_blockscale,
+            w2_scale=w2_blockscale,
+            g1_alphas=w1_gs,
+            g2_alphas=w2_gs,
+        )
         for _ in range(num_repeats):
             with nvtx.annotate("cutlass_moe_fp4", color="green"):
                 cutlass_moe_fp4(
                     a=a,
-                    a1_gscale=a1_gs,
-                    a2_gscale=a2_gs,
                     w1_fp4=w1_fp4,
-                    w1_blockscale=w1_blockscale,
-                    w1_alphas=w1_gs,
                     w2_fp4=w2_fp4,
-                    w2_blockscale=w2_blockscale,
-                    w2_alphas=w2_gs,
                     topk_weights=topk_weights,
                     topk_ids=topk_ids,
                     m=m,
                     n=n,
                     k=k,
                     e=num_experts,
-                    device=device,
+                    quant_config=quant_config,
                 )
 
     def run_cutlass_from_graph(
@@ -211,26 +220,29 @@ def bench_run(
         e: int,
         device: torch.device,
     ):
+        quant_config = nvfp4_moe_quant_config(
+            a1_gscale=a1_gs,
+            a2_gscale=a2_gs,
+            w1_scale=w1_blockscale,
+            w2_scale=w2_blockscale,
+            g1_alphas=w1_gs,
+            g2_alphas=w2_gs,
+        )
+
         with set_current_vllm_config(
             VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
         ):
             return cutlass_moe_fp4(
                 a=a,
-                a1_gscale=a1_gs,
                 w1_fp4=w1_fp4,
-                w1_blockscale=w1_blockscale,
-                w1_alphas=w1_alphas,
-                a2_gscale=a2_gs,
                 w2_fp4=w2_fp4,
-                w2_blockscale=w2_blockscale,
-                w2_alphas=w2_alphas,
                 topk_weights=topk_weights,
                 topk_ids=topk_ids,
                 m=m,
                 n=n,
                 k=k,
                 e=num_experts,
-                device=device,
+                quant_config=quant_config,
             )
 
     def run_triton_from_graph(
@@ -246,16 +258,18 @@ def bench_run(
         with set_current_vllm_config(
             VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
         ):
+            quant_config = fp8_w8a8_moe_quant_config(
+                w1_scale=w1_scale,
+                w2_scale=w2_scale,
+                a1_scale=a_fp8_scale,
+            )
             return fused_experts(
                 a,
                 w1,
                 w2,
                 topk_weights,
                 topk_ids,
-                use_fp8_w8a8=True,
-                w1_scale=w1_scale,
-                w2_scale=w2_scale,
-                a1_scale=a_fp8_scale,
+                quant_config=quant_config,
            )
 
     def replay_graph(graph, num_repeats):
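For reference, the hunks above migrate callers from the per-argument FP8 flags (use_fp8_w8a8=True plus individual w1_scale/w2_scale/a1_scale kwargs) to a single quantization config object. A minimal sketch of the new calling pattern, using only the imports and calls that appear in this diff and assuming the activation, weight, scale, and top-k tensors are already prepared as in the benchmark:

    from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config
    from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts

    # Bundle the FP8 scales once instead of threading them through every call.
    quant_config = fp8_w8a8_moe_quant_config(
        w1_scale=w1_scale,     # scales for the first expert weight
        w2_scale=w2_scale,     # scales for the second expert weight
        a1_scale=a_fp8_scale,  # activation scale
    )

    # fused_experts now takes the config instead of use_fp8_w8a8 and scale kwargs.
    fused_experts(
        a,
        w1,
        w2,
        topk_weights,
        topk_ids,
        quant_config=quant_config,
    )

Building the config once up front also lets it be created outside CUDA-graph capture, which the new benchmark_cutlass_moe_fp8.py below does explicitly.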
							
								
								
									
benchmarks/kernels/benchmark_cutlass_moe_fp8.py (new file, 406 lines)
@@ -0,0 +1,406 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Benchmark the performance of the cutlass_moe_fp8 kernel vs the triton_moe
kernel. Both kernels take in fp8 quantized weights and 16-bit activations,
but use different quantization strategies and backends.
"""

import nvtx
import torch

from vllm import _custom_ops as ops
from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config
from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp8
from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk
from vllm.platforms import current_platform
from vllm.utils import FlexibleArgumentParser

# Weight shapes for different models: [num_experts, topk, hidden_size,
# intermediate_size]
WEIGHT_SHAPES_MOE = {
    "mixtral-8x7b": [
        [8, 2, 4096, 14336],
    ],
    "deepseek-v2": [
        [160, 6, 5120, 12288],
    ],
    "custom-small": [
        [8, 2, 2048, 7168],
    ],
    "glm45-fp8": [
        [128, 8, 4096, 1408],
    ],
    "Llama-4-Maverick-17B-128E-Instruct-FP8": [
        [128, 1, 5120, 8192],
    ],
}

DEFAULT_MODELS = [
    "mixtral-8x7b",
]

DEFAULT_BATCH_SIZES = [4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048]
DEFAULT_TP_SIZES = [1]

PER_ACT_TOKEN_OPTS = [False, True]
PER_OUT_CH_OPTS = [False, True]

FP8_DTYPE = current_platform.fp8_dtype()


def bench_run(
    results: list,
    model: str,
    num_experts: int,
    topk: int,
    per_act_token: bool,
    per_out_ch: bool,
    mkn: tuple[int, int, int],
):
    (m, k, n) = mkn

    dtype = torch.half
    device = "cuda"

    # Create input activations
    a = torch.randn((m, k), device=device, dtype=dtype) / 10

    # Create weights
    w1 = torch.randn((num_experts, 2 * n, k), device=device, dtype=dtype) / 10
    w2 = torch.randn((num_experts, k, n), device=device, dtype=dtype) / 10

    # Create FP8 quantized weights and scales for both kernels
    w1_fp8q = torch.empty((num_experts, 2 * n, k), device=device, dtype=FP8_DTYPE)
    w2_fp8q = torch.empty((num_experts, k, n), device=device, dtype=FP8_DTYPE)

    # Create scales based on quantization strategy
    if per_out_ch:
        # Per-channel quantization
        w1_scale = torch.empty(
            (num_experts, 2 * n, 1), device=device, dtype=torch.float32
        )
        w2_scale = torch.empty((num_experts, k, 1), device=device, dtype=torch.float32)
    else:
        # Per-tensor quantization
        w1_scale = torch.empty((num_experts, 1, 1), device=device, dtype=torch.float32)
        w2_scale = torch.empty((num_experts, 1, 1), device=device, dtype=torch.float32)

    # Quantize weights
    for expert in range(num_experts):
        if per_out_ch:
            # Per-channel quantization - not yet implemented properly
            # For now, fall back to per-tensor quantization
            w1_fp8q[expert], w1_scale_temp = ops.scaled_fp8_quant(w1[expert])
            w2_fp8q[expert], w2_scale_temp = ops.scaled_fp8_quant(w2[expert])
            # Expand scalar scales to the expected per-channel shape
            w1_scale[expert] = w1_scale_temp.expand(2 * n, 1)
            w2_scale[expert] = w2_scale_temp.expand(k, 1)
        else:
            # Per-tensor quantization
            w1_fp8q[expert], w1_scale_temp = ops.scaled_fp8_quant(w1[expert])
            w2_fp8q[expert], w2_scale_temp = ops.scaled_fp8_quant(w2[expert])
            # Store scalar scales in [1, 1] tensors
            w1_scale[expert, 0, 0] = w1_scale_temp
            w2_scale[expert, 0, 0] = w2_scale_temp

    # Prepare weights for CUTLASS (no transpose needed)
    w1_fp8q_cutlass = w1_fp8q  # Keep original [E, 2N, K]
    w2_fp8q_cutlass = w2_fp8q  # Keep original [E, K, N]

    # Create router scores and get topk
    score = torch.randn((m, num_experts), device=device, dtype=dtype)
    topk_weights, topk_ids, _ = fused_topk(a, score, topk, renormalize=False)

    # WORKAROUND: CUTLASS MoE FP8 has issues with per-token quantization
    # Force per-tensor quantization for all cases to match working e2e setup
    a1_scale = torch.full((), 1e-2, device=device, dtype=torch.float32)
    a2_scale = torch.full((), 1e-2, device=device, dtype=torch.float32)

    # Force per-tensor quantization for all cases
    per_act_token = False

    # Create stride tensors for CUTLASS
    ab_strides1 = torch.full((num_experts,), k, dtype=torch.int64, device=device)
    ab_strides2 = torch.full((num_experts,), n, dtype=torch.int64, device=device)
    c_strides1 = torch.full((num_experts,), 2 * n, dtype=torch.int64, device=device)
    c_strides2 = torch.full((num_experts,), k, dtype=torch.int64, device=device)

    def run_triton_moe(
        a: torch.Tensor,
        w1: torch.Tensor,
        w2: torch.Tensor,
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
        w1_scale: torch.Tensor,
        w2_scale: torch.Tensor,
        a1_scale: torch.Tensor,
        a2_scale: torch.Tensor,
        num_repeats: int,
    ):
        quant_config = fp8_w8a8_moe_quant_config(
            w1_scale=w1_scale,
            w2_scale=w2_scale,
            a1_scale=a1_scale,
            a2_scale=a2_scale,
            per_act_token_quant=per_act_token,
            per_out_ch_quant=per_out_ch,
        )

        for _ in range(num_repeats):
            fused_experts(
                a,
                w1,
                w2,
                topk_weights,
                topk_ids,
                quant_config=quant_config,
            )

    def run_cutlass_moe_fp8(
        a: torch.Tensor,
        w1: torch.Tensor,
        w2: torch.Tensor,
        topk_weights: torch.Tensor,
        topk_ids: torch.Tensor,
        ab_strides1: torch.Tensor,
        ab_strides2: torch.Tensor,
        c_strides1: torch.Tensor,
        c_strides2: torch.Tensor,
        w1_scale: torch.Tensor,
        w2_scale: torch.Tensor,
        a1_scale: torch.Tensor,
        a2_scale: torch.Tensor,
        num_repeats: int,
    ):
        quant_config = fp8_w8a8_moe_quant_config(
            w1_scale=w1_scale,
            w2_scale=w2_scale,
            a1_scale=a1_scale,
            a2_scale=a2_scale,
            per_act_token_quant=per_act_token,
            per_out_ch_quant=per_out_ch,
        )

        for _ in range(num_repeats):
            with nvtx.annotate("cutlass_moe_fp8", color="blue"):
                cutlass_moe_fp8(
                    a=a,
                    w1_q=w1,
                    w2_q=w2,
                    topk_weights=topk_weights,
                    topk_ids=topk_ids,
                    ab_strides1=ab_strides1,
                    ab_strides2=ab_strides2,
                    c_strides1=c_strides1,
                    c_strides2=c_strides2,
                    quant_config=quant_config,
                    activation="silu",
                    global_num_experts=num_experts,
                )

    # Pre-create quantization config to avoid creating it inside CUDA graph
    quant_config = fp8_w8a8_moe_quant_config(
        w1_scale=w1_scale,
        w2_scale=w2_scale,
        a1_scale=a1_scale,
        a2_scale=a2_scale,
        per_act_token_quant=per_act_token,
        per_out_ch_quant=per_out_ch,
    )

    # Create CUDA graphs for CUTLASS (match benchmark_moe.py pattern exactly)
    cutlass_stream = torch.cuda.Stream()
    cutlass_graph = torch.cuda.CUDAGraph()
    with torch.cuda.graph(cutlass_graph, stream=cutlass_stream):
        # Capture 10 invocations like benchmark_moe.py
        for _ in range(10):
            cutlass_moe_fp8(
                a=a,
                w1_q=w1_fp8q_cutlass,
                w2_q=w2_fp8q_cutlass,
                topk_weights=topk_weights,
                topk_ids=topk_ids,
                ab_strides1=ab_strides1,
                ab_strides2=ab_strides2,
                c_strides1=c_strides1,
                c_strides2=c_strides2,
                quant_config=quant_config,
                activation="silu",
                global_num_experts=num_experts,
            )
    torch.cuda.synchronize()

    # Create CUDA graphs for Triton (match benchmark_moe.py pattern exactly)
    triton_stream = torch.cuda.Stream()
    triton_graph = torch.cuda.CUDAGraph()
    with torch.cuda.graph(triton_graph, stream=triton_stream):
        # Capture 10 invocations like benchmark_moe.py
        for _ in range(10):
            fused_experts(
                a,
                w1_fp8q,
                w2_fp8q,
                topk_weights,
                topk_ids,
                quant_config=quant_config,
            )
    torch.cuda.synchronize()

    def bench_cuda_graph(graph, num_warmup=5, num_iters=100):
        """Benchmark CUDA graph using events like benchmark_moe.py"""
        # Warmup
        for _ in range(num_warmup):
            graph.replay()
        torch.cuda.synchronize()

        # Timing
        start_event = torch.cuda.Event(enable_timing=True)
        end_event = torch.cuda.Event(enable_timing=True)

        latencies = []
        for _ in range(num_iters):
            torch.cuda.synchronize()
            start_event.record()
            graph.replay()
            end_event.record()
            end_event.synchronize()
            latencies.append(start_event.elapsed_time(end_event))

        # Divide by 10 since graph contains 10 calls
        return sum(latencies) / (num_iters * 10)

    # Benchmark parameters
    num_warmup = 5
    num_iters = 100

    # Benchmark only CUDA graphs (more reliable and faster)
    # Benchmark Triton MoE with CUDA graphs
    triton_graph_time = bench_cuda_graph(
        triton_graph, num_warmup=num_warmup, num_iters=num_iters
    )

    # Benchmark CUTLASS MoE with CUDA graphs
    cutlass_graph_time = bench_cuda_graph(
        cutlass_graph, num_warmup=num_warmup, num_iters=num_iters
    )

    # Convert ms to us and return results
    triton_time_us = triton_graph_time * 1000
    cutlass_time_us = cutlass_graph_time * 1000

    return {
        "batch_size": m,
        "triton_time_us": triton_time_us,
        "cutlass_time_us": cutlass_time_us,
    }


def main(args):
    print("Benchmarking models:")
    for i, model in enumerate(args.models):
        print(f"[{i}]  {model}")

    all_results = []

    for model in args.models:
        for tp in args.tp_sizes:
            for layer in WEIGHT_SHAPES_MOE[model]:
                num_experts = layer[0]
                topk = layer[1]
                size_k = layer[2]
                size_n = layer[3] // tp

                if len(args.limit_k) > 0 and size_k not in args.limit_k:
                    continue

                if len(args.limit_n) > 0 and size_n not in args.limit_n:
                    continue

                for per_act_token in args.per_act_token_opts:
                    for per_out_ch in args.per_out_ch_opts:
                        print(
                            f"\n=== {model}, experts={num_experts}, topk={topk},"
                            f"per_act={per_act_token}, per_out_ch={per_out_ch} ==="
                        )

                        config_results = []
                        for size_m in args.batch_sizes:
                            mkn = (size_m, size_k, size_n)
                            result = bench_run(
                                [],  # Not used anymore
                                model,
                                num_experts,
                                topk,
                                per_act_token,
                                per_out_ch,
                                mkn,
                            )
                            if result:
                                config_results.append(result)

                        # Print results table for this configuration
                        if config_results:
                            print(
                                f"\n{'Batch Size':<12}"
                                f"{'Triton (us)':<15}"
                                f"{'CUTLASS (us)':<15}"
                            )
                            print("-" * 45)
                            for result in config_results:
                                print(
                                    f"{result['batch_size']:<12}"
                                    f"{result['triton_time_us']:<15.2f}"
                                    f"{result['cutlass_time_us']:<15.2f}"
                                )

                            all_results.extend(config_results)

    print(f"\nTotal benchmarks completed: {len(all_results)}")


if __name__ == "__main__":
    parser = FlexibleArgumentParser(
        description="""Benchmark CUTLASS FP8 MOE vs Triton FP8 FUSED MOE
         across specified models/shapes/batches

        Example usage:
        python benchmark_cutlass_moe_fp8.py  \
            --model "Llama-4-Maverick-17B-128E-Instruct-FP8"  \
            --tp-sizes 8 \
            --batch-size 2 4 8  \
            --per-act-token-opts false \
            --per-out-ch-opts false

        """
    )
    parser.add_argument(
        "--models",
        nargs="+",
        type=str,
        default=DEFAULT_MODELS,
        choices=WEIGHT_SHAPES_MOE.keys(),
    )
    parser.add_argument("--tp-sizes", nargs="+", type=int, default=DEFAULT_TP_SIZES)
    parser.add_argument(
        "--batch-sizes", nargs="+", type=int, default=DEFAULT_BATCH_SIZES
    )
    parser.add_argument("--limit-k", nargs="+", type=int, default=[])
    parser.add_argument("--limit-n", nargs="+", type=int, default=[])
    parser.add_argument(
        "--per-act-token-opts",
        nargs="+",
        type=lambda x: x.lower() == "true",
        default=[False, True],
        help="Per-activation token quantization options (true/false)",
    )
    parser.add_argument(
        "--per-out-ch-opts",
        nargs="+",
        type=lambda x: x.lower() == "true",
        default=[False, True],
        help="Per-output channel quantization options (true/false)",
    )

    args = parser.parse_args()
    main(args)
@@ -7,6 +7,10 @@ Benchmark script for device communicators:
 CustomAllreduce (oneshot, twoshot), PyNcclCommunicator,
 and SymmMemCommunicator (multimem, two-shot).
+
+for NCCL symmetric memory you need to set the environment variables
+NCCL_NVLS_ENABLE=1 NCCL_CUMEM_ENABLE=1 VLLM_USE_NCCL_SYMM_MEM=1, otherwise NCCL does
+not use fast NVLS implementation for all reduce.
 
 Usage:
     torchrun --nproc_per_node=<N> benchmark_device_communicators.py [options]
 
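The added docstring lines above call out three environment variables. They are normally exported by the shell that launches torchrun (as in the Usage line); purely as an illustration, a minimal sketch of setting them programmatically before any distributed initialization:

    import os

    # Illustration only: must run before NCCL communicators are created.
    # Shell equivalent:
    #   NCCL_NVLS_ENABLE=1 NCCL_CUMEM_ENABLE=1 VLLM_USE_NCCL_SYMM_MEM=1 \
    #       torchrun --nproc_per_node=<N> benchmark_device_communicators.py
    os.environ.setdefault("NCCL_NVLS_ENABLE", "1")
    os.environ.setdefault("NCCL_CUMEM_ENABLE", "1")
    os.environ.setdefault("VLLM_USE_NCCL_SYMM_MEM", "1")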
@@ -18,15 +22,21 @@ Example:
 import json
 import os
 import time
+from collections.abc import Callable
 from contextlib import nullcontext
-from typing import Callable, Optional
 
 import torch
 import torch.distributed as dist
 from torch.distributed import ProcessGroup
 
 from vllm.distributed.device_communicators.custom_all_reduce import CustomAllreduce
-from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator
+from vllm.distributed.device_communicators.pynccl import (
+    PyNcclCommunicator,
+    register_nccl_symmetric_ops,
+)
+from vllm.distributed.device_communicators.pynccl_allocator import (
+    set_graph_pool_id,
+)
 from vllm.distributed.device_communicators.symm_mem import SymmMemCommunicator
 from vllm.logger import init_logger
 from vllm.utils import FlexibleArgumentParser
@@ -98,6 +108,7 @@ class CommunicatorBenchmark:
             )
             if not self.pynccl_comm.disabled:
                 logger.info("Rank %s: PyNcclCommunicator initialized", self.rank)
+                register_nccl_symmetric_ops(self.pynccl_comm)
             else:
                 logger.info("Rank %s: PyNcclCommunicator disabled", self.rank)
                 self.pynccl_comm = None
@@ -194,6 +205,15 @@ class CommunicatorBenchmark:
                     None,  # no env variable needed
                 )
             )
+            communicators.append(
+                (
+                    "pynccl-symm",
+                    lambda t: torch.ops.vllm.all_reduce_symmetric_with_copy(t),
+                    lambda t: True,  # Always available if initialized
+                    nullcontext(),
+                    None,  # no env variable needed
+                )
+            )
 
         if self.symm_mem_comm_multimem is not None:
             comm = self.symm_mem_comm_multimem
@@ -244,12 +264,12 @@ class CommunicatorBenchmark:
     def benchmark_allreduce_single(
         self,
         sequence_length: int,
-        allreduce_fn: Callable[[torch.Tensor], Optional[torch.Tensor]],
+        allreduce_fn: Callable[[torch.Tensor], torch.Tensor | None],
         should_use_fn: Callable[[torch.Tensor], bool],
         context,
         num_warmup: int,
         num_trials: int,
-    ) -> Optional[float]:
+    ) -> float | None:
         """Benchmark method with CUDA graph optimization."""
         try:
             # Create test tensor (2D: sequence_length x hidden_size)
@@ -271,7 +291,9 @@ class CommunicatorBenchmark:
                 # Capture the graph using context manager
                 with context:
                     graph = torch.cuda.CUDAGraph()
-                    with torch.cuda.graph(graph):
+                    graph_pool = torch.cuda.graph_pool_handle()
+                    set_graph_pool_id(graph_pool)
+                    with torch.cuda.graph(graph, pool=graph_pool):
                         for _ in range(CUDA_GRAPH_CAPTURE_CYCLES):
                             allreduce_fn(graph_input)
 
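The hunk above switches graph capture to an explicit memory pool obtained from torch.cuda.graph_pool_handle(), which is also registered with vLLM's NCCL allocator via set_graph_pool_id. A minimal pure-PyTorch sketch of the capture-and-replay timing pattern with an explicit pool (the callable fn is a stand-in for an all-reduce; this is not vLLM code):

    import torch

    def capture_and_time(fn, tensor, capture_cycles=10, iters=100):
        # Warm up eagerly so lazy initialization is not captured in the graph.
        fn(tensor)
        torch.cuda.synchronize()

        graph = torch.cuda.CUDAGraph()
        pool = torch.cuda.graph_pool_handle()  # explicit pool, as in the hunk above
        with torch.cuda.graph(graph, pool=pool):
            for _ in range(capture_cycles):
                fn(tensor)

        start = torch.cuda.Event(enable_timing=True)
        end = torch.cuda.Event(enable_timing=True)
        torch.cuda.synchronize()
        start.record()
        for _ in range(iters):
            graph.replay()
        end.record()
        end.synchronize()
        # Average milliseconds per captured call.
        return start.elapsed_time(end) / (iters * capture_cycles)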
@@ -7,6 +7,7 @@ from benchmark_shapes import WEIGHT_SHAPES_MOE
 
 from vllm import _custom_ops as ops
 from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
+from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config
 from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp8
 from vllm.model_executor.layers.fused_moe.fused_moe import (
     fused_experts,
@@ -96,6 +97,11 @@ def bench_run(
         a_scale: torch.Tensor,
         num_repeats: int,
     ):
+        quant_config = fp8_w8a8_moe_quant_config(
+            w1_scale=w1_scale,
+            w2_scale=w2_scale,
+            a1_scale=a_scale,
+        )
         for _ in range(num_repeats):
             fused_experts(
                 a,
@@ -103,10 +109,7 @@ def bench_run(
                 w2,
                 topk_weights,
                 topk_ids,
-                use_fp8_w8a8=True,
-                w1_scale=w1_scale,
-                w2_scale=w2_scale,
-                a1_scale=a_scale,
+                quant_config=quant_config,
             )
 
     def run_cutlass_moe(
@@ -125,6 +128,12 @@ def bench_run(
         per_act_token: bool,
         num_repeats: int,
     ):
+        quant_config = fp8_w8a8_moe_quant_config(
+            w1_scale=w1_scale,
+            w2_scale=w2_scale,
+            per_act_token_quant=per_act_token,
+        )
+
         for _ in range(num_repeats):
             cutlass_moe_fp8(
                 a,
@@ -132,14 +141,11 @@ def bench_run(
                 w2,
                 topk_weights,
                 topk_ids,
-                w1_scale,
-                w2_scale,
                 ab_strides1,
                 ab_strides2,
                 c_strides1,
                 c_strides2,
-                per_act_token,
-                a1_scale=None,
+                quant_config=quant_config,
             )
 
     def run_cutlass_from_graph(
@@ -156,6 +162,12 @@ def bench_run(
         topk_weights: torch.Tensor,
         topk_ids: torch.Tensor,
     ):
+        quant_config = fp8_w8a8_moe_quant_config(
+            w1_scale=w1_scale,
+            w2_scale=w2_scale,
+            per_act_token_quant=per_act_token,
+        )
+
         with set_current_vllm_config(
             VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
         ):
@@ -165,14 +177,11 @@ def bench_run(
                 w2_q,
                 topk_weights,
                 topk_ids,
-                w1_scale,
-                w2_scale,
                 ab_strides1,
                 ab_strides2,
                 c_strides1,
                 c_strides2,
-                per_act_token,
-                a1_scale=None,
+                quant_config=quant_config,
             )
 
     def run_triton_from_graph(
@@ -185,6 +194,11 @@ def bench_run(
         w2_scale: torch.Tensor,
         a_scale: torch.Tensor,
     ):
+        quant_config = fp8_w8a8_moe_quant_config(
+            w1_scale=w1_scale,
+            w2_scale=w2_scale,
+            a1_scale=a_scale,
+        )
         with set_current_vllm_config(
             VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
         ):
@@ -194,10 +208,7 @@ def bench_run(
                 w2,
                 topk_weights,
                 topk_ids,
-                use_fp8_w8a8=True,
-                w1_scale=w1_scale,
-                w2_scale=w2_scale,
-                a1_scale=a_scale,
+                quant_config=quant_config,
             )
 
     def replay_graph(graph, num_repeats):
@@ -7,7 +7,8 @@ import torch
 
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.platforms import current_platform
-from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
+from vllm.utils import FlexibleArgumentParser
+from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
 
 
 @torch.inference_mode()
@@ -6,11 +6,12 @@ import copy
 import json
 import pickle
 import time
+from collections.abc import Callable
 from dataclasses import dataclass
 from enum import Enum, auto
 from itertools import product
 from pathlib import Path
-from typing import Any, Callable, Optional
+from typing import Any
 
 import torch
 import torch.utils.benchmark as TBenchmark
@@ -79,9 +80,9 @@ def make_rand_lora_weight_tensor(
 
 
 def make_rand_tensors(
-    a_shape: tuple[int],
-    b_shape: tuple[int],
-    c_shape: tuple[int],
+    a_shape: tuple[int, ...],
+    b_shape: tuple[int, ...],
+    c_shape: tuple[int, ...],
     a_dtype: torch.dtype,
     b_dtype: torch.dtype,
     c_dtype: torch.dtype,
@@ -158,7 +159,7 @@ def ref_group_gemm(
     seq_lens_cpu: torch.Tensor,
     prompt_lora_mapping_cpu: torch.Tensor,
     scaling: float,
-    add_inputs: Optional[bool],
+    add_inputs: bool | None,
 ):
     """
     Torch group gemm reference implementation to test correctness of
@@ -243,7 +244,7 @@ class OpType(Enum):
         lora_rank: int,
         num_loras: int,
         num_slices: int,
-    ) -> tuple[tuple[int], tuple[int], tuple[int]]:
+    ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...]]:
         """
         Given num_slices, return the shapes of the A, B, and C matrices
         in A x B = C, for the op_type
@@ -316,8 +317,8 @@ class BenchmarkContext:
     lora_rank: int
     sort_by_lora_id: bool
     dtype: torch.dtype
-    seq_length: Optional[int] = None
-    num_slices: Optional[int] = None  # num_slices for slice based ops
+    seq_length: int | None = None
+    num_slices: int | None = None  # num_slices for slice based ops
 
     def with_seq_length(self, seq_length: int) -> "BenchmarkContext":
         ctx = copy.copy(self)
@@ -464,7 +465,11 @@ class BenchmarkTensors:
         for field_name in LoRAKernelMeta.__dataclass_fields__:
             field = getattr(self.lora_kernel_meta, field_name)
             assert isinstance(field, torch.Tensor)
-            setattr(self.lora_kernel_meta, field_name, to_device(field))
+            setattr(
+                self.lora_kernel_meta,
+                field_name,
+                to_device(field) if field_name != "no_lora_flag_cpu" else field,
+            )
 
     def metadata(self) -> tuple[int, int, int]:
         """
@@ -512,6 +517,7 @@ class BenchmarkTensors:
             "lora_token_start_loc": self.lora_kernel_meta.lora_token_start_loc,
             "lora_ids": self.lora_kernel_meta.active_lora_ids,
             "scaling": 1.0,
+            "no_lora_flag_cpu": self.lora_kernel_meta.no_lora_flag_cpu,
         }
 
     def as_lora_expand_kwargs(self, add_inputs: bool) -> dict[str, Any]:
@ -552,10 +558,11 @@ class BenchmarkTensors:
 | 
				
			|||||||
            "lora_ids": self.lora_kernel_meta.active_lora_ids,
 | 
					            "lora_ids": self.lora_kernel_meta.active_lora_ids,
 | 
				
			||||||
            "offset_start": 0,
 | 
					            "offset_start": 0,
 | 
				
			||||||
            "add_inputs": add_inputs,
 | 
					            "add_inputs": add_inputs,
 | 
				
			||||||
 | 
					            "no_lora_flag_cpu": self.lora_kernel_meta.no_lora_flag_cpu,
 | 
				
			||||||
        }
 | 
					        }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def bench_fn_kwargs(
 | 
					    def bench_fn_kwargs(
 | 
				
			||||||
        self, op_type: OpType, add_inputs: Optional[bool] = None
 | 
					        self, op_type: OpType, add_inputs: bool | None = None
 | 
				
			||||||
    ) -> dict[str, Any]:
 | 
					    ) -> dict[str, Any]:
 | 
				
			||||||
        if op_type.is_shrink_fn():
 | 
					        if op_type.is_shrink_fn():
 | 
				
			||||||
            assert add_inputs is None
 | 
					            assert add_inputs is None
 | 
				
			||||||
@ -569,7 +576,7 @@ class BenchmarkTensors:
 | 
				
			|||||||
        raise ValueError(f"Unrecognized optype {self}")
 | 
					        raise ValueError(f"Unrecognized optype {self}")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def test_correctness(
 | 
					    def test_correctness(
 | 
				
			||||||
        self, op_type: OpType, expand_fn_add_inputs: Optional[bool]
 | 
					        self, op_type: OpType, expand_fn_add_inputs: bool | None
 | 
				
			||||||
    ) -> bool:
 | 
					    ) -> bool:
 | 
				
			||||||
        """
 | 
					        """
 | 
				
			||||||
        Test correctness of op_type implementation against a grouped gemm
 | 
					        Test correctness of op_type implementation against a grouped gemm
 | 
				
			||||||
@ -605,8 +612,8 @@ def bench_optype(
 | 
				
			|||||||
    ctx: BenchmarkContext,
 | 
					    ctx: BenchmarkContext,
 | 
				
			||||||
    arg_pool_size: int,
 | 
					    arg_pool_size: int,
 | 
				
			||||||
    op_type: OpType,
 | 
					    op_type: OpType,
 | 
				
			||||||
    cuda_graph_nops: Optional[int] = None,
 | 
					    cuda_graph_nops: int | None = None,
 | 
				
			||||||
    expand_fn_add_inputs: Optional[bool] = None,
 | 
					    expand_fn_add_inputs: bool | None = None,
 | 
				
			||||||
    test_correctness: bool = False,
 | 
					    test_correctness: bool = False,
 | 
				
			||||||
) -> TMeasurement:
 | 
					) -> TMeasurement:
 | 
				
			||||||
    assert arg_pool_size >= 1
 | 
					    assert arg_pool_size >= 1
 | 
				
			||||||
@ -673,7 +680,7 @@ def bench_torch_mm(
 | 
				
			|||||||
    ctx: BenchmarkContext,
 | 
					    ctx: BenchmarkContext,
 | 
				
			||||||
    arg_pool_size: int,
 | 
					    arg_pool_size: int,
 | 
				
			||||||
    op_type: OpType,
 | 
					    op_type: OpType,
 | 
				
			||||||
    cuda_graph_nops: Optional[int] = None,
 | 
					    cuda_graph_nops: int | None = None,
 | 
				
			||||||
) -> TMeasurement:
 | 
					) -> TMeasurement:
 | 
				
			||||||
    """
 | 
					    """
 | 
				
			||||||
    Benchmark basic torch.mm as a roofline.
 | 
					    Benchmark basic torch.mm as a roofline.
 | 
				
			||||||
@ -738,7 +745,7 @@ def use_cuda_graph_recommendation() -> str:
 | 
				
			|||||||
            """
 | 
					            """
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def print_timers(timers: list[TMeasurement], args: Optional[argparse.Namespace] = None):
 | 
					def print_timers(timers: list[TMeasurement], args: argparse.Namespace | None = None):
 | 
				
			||||||
    compare = TBenchmark.Compare(timers)
 | 
					    compare = TBenchmark.Compare(timers)
 | 
				
			||||||
    compare.print()
 | 
					    compare.print()
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
				
			|||||||
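Editor's note (not part of the diff): most of the annotation edits in this hunk, and in many of the hunks below, are a mechanical modernization from typing.Optional / typing.Union to PEP 604 unions. A minimal illustration of the equivalence; the function names here are placeholders, and the X | Y syntax requires Python 3.10 or newer:

from typing import Optional


def before(seq_length: Optional[int] = None, cuda_graph_nops: Optional[int] = None) -> Optional[int]:
    # Pre-PEP 604 spelling used by the old benchmark code.
    return seq_length if seq_length is not None else cuda_graph_nops


def after(seq_length: int | None = None, cuda_graph_nops: int | None = None) -> int | None:
    # Equivalent annotation with no typing import (Python 3.10+).
    return seq_length if seq_length is not None else cuda_graph_nops
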
@@ -8,10 +8,9 @@ import math
 import os
 import pickle as pkl
 import time
-from collections.abc import Iterable
+from collections.abc import Callable, Iterable
 from dataclasses import dataclass
 from itertools import product
-from typing import Callable, Optional
 
 import pandas as pd
 import torch
@@ -63,23 +62,23 @@ class BenchmarkTensors:
     a: torch.Tensor
 
     w_q: torch.Tensor
-    group_size: Optional[int]
+    group_size: int | None
     wtype: ScalarType
     w_g_s: torch.Tensor
-    w_g_zp: Optional[torch.Tensor]
+    w_g_zp: torch.Tensor | None
-    w_ch_s: Optional[torch.Tensor]
+    w_ch_s: torch.Tensor | None
-    w_tok_s: Optional[torch.Tensor]
+    w_tok_s: torch.Tensor | None
 
 
 @dataclass
 class TypeConfig:
     act_type: torch.dtype
     weight_type: ScalarType
-    output_type: Optional[torch.dtype]
+    output_type: torch.dtype | None
-    group_scale_type: Optional[torch.dtype]
+    group_scale_type: torch.dtype | None
-    group_zero_type: Optional[torch.dtype]
+    group_zero_type: torch.dtype | None
-    channel_scale_type: Optional[torch.dtype]
+    channel_scale_type: torch.dtype | None
-    token_scale_type: Optional[torch.dtype]
+    token_scale_type: torch.dtype | None
 
 
 def rand_data(shape, dtype=torch.float16, scale=1):
@@ -93,8 +92,8 @@ def quantize_and_pack(
     atype: torch.dtype,
     w: torch.Tensor,
     wtype: ScalarType,
-    stype: Optional[torch.dtype],
+    stype: torch.dtype | None,
-    group_size: Optional[int],
+    group_size: int | None,
     zero_points: bool = False,
 ):
     assert wtype.is_integer(), "TODO: support floating point weights"
@@ -113,7 +112,7 @@ def quantize_and_pack(
 
 
 def create_bench_tensors(
-    shape: tuple[int, int, int], types: TypeConfig, group_size: Optional[int]
+    shape: tuple[int, int, int], types: TypeConfig, group_size: int | None
 ) -> list[BenchmarkTensors]:
     m, n, k = shape
 
@@ -331,8 +330,8 @@ def bench_fns(label: str, sub_label: str, description: str, fns: list[Callable])
     return res
 
 
-_SWEEP_SCHEDULES_RESULTS: Optional[pd.DataFrame] = None
+_SWEEP_SCHEDULES_RESULTS: pd.DataFrame | None = None
-_SWEEP_SCHEDULES_RESULTS_CSV: Optional[str] = None
+_SWEEP_SCHEDULES_RESULTS_CSV: str | None = None
 
 
 def bench(
@@ -14,6 +14,10 @@ import ray
 import torch
 from ray.experimental.tqdm_ray import tqdm
 
+from vllm.model_executor.layers.fused_moe.config import (
+    FusedMoEQuantConfig,
+    _get_config_dtype_str,
+)
 from vllm.model_executor.layers.fused_moe.fused_moe import *
 from vllm.platforms import current_platform
 from vllm.transformers_utils.config import get_config
@@ -134,10 +138,25 @@ def benchmark_config(
     def run():
         from vllm.model_executor.layers.fused_moe import override_config
 
+        if use_fp8_w8a8:
+            quant_dtype = torch.float8_e4m3fn
+        elif use_int8_w8a16:
+            quant_dtype = torch.int8
+        else:
+            quant_dtype = None
+
+        quant_config = FusedMoEQuantConfig.make(
+            quant_dtype=quant_dtype,
+            w1_scale=w1_scale,
+            w2_scale=w2_scale,
+            a1_scale=a1_scale,
+            a2_scale=a2_scale,
+            block_shape=block_quant_shape,
+        )
+
         with override_config(config):
-            if use_deep_gemm:
-            topk_weights, topk_ids, token_expert_indices = fused_topk(
+            topk_weights, topk_ids, token_expert_indices = fused_topk(
-                    x, input_gating, topk, False
+                x, input_gating, topk, renormalize=not use_deep_gemm
             )
             return fused_experts(
                 x,
@@ -146,30 +165,8 @@ def benchmark_config(
                 topk_weights,
                 topk_ids,
                 inplace=True,
-                    use_fp8_w8a8=use_fp8_w8a8,
+                quant_config=quant_config,
-                    w1_scale=w1_scale,
+                allow_deep_gemm=use_deep_gemm,
-                    w2_scale=w2_scale,
-                    a1_scale=a1_scale,
-                    a2_scale=a2_scale,
-                    block_shape=block_quant_shape,
-                    allow_deep_gemm=True,
-                )
-            else:
-                fused_moe(
-                    x,
-                    w1,
-                    w2,
-                    input_gating,
-                    topk,
-                    renormalize=True,
-                    inplace=True,
-                    use_fp8_w8a8=use_fp8_w8a8,
-                    use_int8_w8a16=use_int8_w8a16,
-                    w1_scale=w1_scale,
-                    w2_scale=w2_scale,
-                    a1_scale=a1_scale,
-                    a2_scale=a2_scale,
-                    block_shape=block_quant_shape,
             )
 
     # JIT compilation & warmup
@@ -414,7 +411,7 @@ class BenchmarkWorker:
         use_deep_gemm: bool = False,
     ) -> tuple[dict[str, int], float]:
         current_platform.seed_everything(self.seed)
-        dtype_str = get_config_dtype_str(
+        dtype_str = _get_config_dtype_str(
             dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8
         )
         # NOTE(woosuk): The current naming convention uses w2.shape[2], which
@@ -547,7 +544,7 @@ def save_configs(
     block_quant_shape: list[int],
     save_dir: str,
 ) -> None:
-    dtype_str = get_config_dtype_str(
+    dtype_str = _get_config_dtype_str(
         dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8
     )
 
@@ -560,7 +557,7 @@ def save_configs(
     filename = os.path.join(save_dir, filename)
     print(f"Writing best config to {filename}...")
     with open(filename, "w") as f:
-        json.dump(configs, f, indent=4)
+        json.dump({"triton_version": triton.__version__, **configs}, f, indent=4)
         f.write("\n")
 
 
@@ -582,18 +579,22 @@ def main(args: argparse.Namespace):
         E = config.ffn_config.moe_num_experts
         topk = config.ffn_config.moe_top_k
         intermediate_size = config.ffn_config.ffn_hidden_size
+        hidden_size = config.hidden_size
     elif config.architectures[0] == "JambaForCausalLM":
         E = config.num_experts
         topk = config.num_experts_per_tok
         intermediate_size = config.intermediate_size
+        hidden_size = config.hidden_size
     elif config.architectures[0] in (
-        "DeepseekV3ForCausalLM",
         "DeepseekV2ForCausalLM",
+        "DeepseekV3ForCausalLM",
+        "DeepseekV32ForCausalLM",
         "Glm4MoeForCausalLM",
     ):
         E = config.n_routed_experts
         topk = config.num_experts_per_tok
         intermediate_size = config.moe_intermediate_size
+        hidden_size = config.hidden_size
     elif config.architectures[0] in (
         "Qwen2MoeForCausalLM",
         "Qwen3MoeForCausalLM",
@@ -602,10 +603,18 @@ def main(args: argparse.Namespace):
         E = config.num_experts
         topk = config.num_experts_per_tok
         intermediate_size = config.moe_intermediate_size
+        hidden_size = config.hidden_size
+    elif config.architectures[0] == "Qwen3VLMoeForConditionalGeneration":
+        text_config = config.get_text_config()
+        E = text_config.num_experts
+        topk = text_config.num_experts_per_tok
+        intermediate_size = text_config.moe_intermediate_size
+        hidden_size = text_config.hidden_size
     elif config.architectures[0] in ("HunYuanMoEV1ForCausalLM"):
         E = config.num_experts
         topk = config.moe_topk[0]
         intermediate_size = config.moe_intermediate_size[0]
+        hidden_size = config.hidden_size
     else:
         # Support for llama4
        config = config.get_text_config()
@@ -613,6 +622,7 @@ def main(args: argparse.Namespace):
         E = config.num_local_experts
         topk = config.num_experts_per_tok
         intermediate_size = config.intermediate_size
+        hidden_size = config.hidden_size
     enable_ep = bool(args.enable_expert_parallel)
     if enable_ep:
         ensure_divisibility(E, args.tp_size, "Number of experts")
@@ -621,8 +631,7 @@ def main(args: argparse.Namespace):
     else:
         ensure_divisibility(intermediate_size, args.tp_size, "intermediate_size")
         shard_intermediate_size = 2 * intermediate_size // args.tp_size
-    hidden_size = config.hidden_size
+    dtype = torch.float16 if current_platform.is_rocm() else config.dtype
-    dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
     use_fp8_w8a8 = args.dtype == "fp8_w8a8"
     use_int8_w8a16 = args.dtype == "int8_w8a16"
     block_quant_shape = get_weight_block_size_safety(config)
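Editor's note (not part of the diff): the benchmark_config hunk above replaces the old per-flag keyword arguments (use_fp8_w8a8=, w1_scale=, block_shape=, ...) with a single FusedMoEQuantConfig object passed as quant_config=. A minimal sketch of the new call pattern, using only the keywords that appear in the hunk; the shapes, expert count, and the unquantized quant_dtype=None setting are illustrative assumptions, and running it requires a CUDA build of vLLM:

import torch

from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig
from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk

# Illustrative sizes only: experts, hidden, intermediate, tokens, top-k.
E, H, I, M, topk = 16, 1024, 2048, 8, 2
x = torch.randn(M, H, dtype=torch.bfloat16, device="cuda")
w1 = torch.randn(E, 2 * I, H, dtype=torch.bfloat16, device="cuda")
w2 = torch.randn(E, H, I, dtype=torch.bfloat16, device="cuda")
gating = torch.randn(M, E, dtype=torch.float32, device="cuda")

# Unquantized path: quant_dtype=None and no scales, mirroring the diff's else-branch.
quant_config = FusedMoEQuantConfig.make(
    quant_dtype=None,
    w1_scale=None,
    w2_scale=None,
    a1_scale=None,
    a2_scale=None,
    block_shape=None,
)

topk_weights, topk_ids, _ = fused_topk(x, gating, topk, renormalize=True)
out = fused_experts(
    x,
    w1,
    w2,
    topk_weights,
    topk_ids,
    inplace=True,
    quant_config=quant_config,
    allow_deep_gemm=False,
)
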
@@ -344,7 +344,7 @@ def main(args: argparse.Namespace):
         topk = config.num_experts_per_tok
 
     hidden_size = config.hidden_size
-    dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
+    dtype = torch.float16 if current_platform.is_rocm() else config.dtype
     use_fp8_w8a8 = args.dtype == "fp8_w8a8"
     use_int8_w8a16 = args.dtype == "int8_w8a16"
     use_customized_permute = args.use_customized_permute
@@ -3,16 +3,15 @@
 
 import random
 import time
-from typing import Optional
 
 import torch
 
 from vllm import _custom_ops as ops
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
-from vllm.utils import (
+from vllm.utils import FlexibleArgumentParser
+from vllm.utils.torch_utils import (
     STR_DTYPE_TO_TORCH_DTYPE,
-    FlexibleArgumentParser,
     create_kv_caches_with_random,
 )
 
@@ -37,7 +36,7 @@ def main(
     seed: int,
     do_profile: bool,
     device: str = "cuda",
-    kv_cache_dtype: Optional[str] = None,
+    kv_cache_dtype: str | None = None,
 ) -> None:
     current_platform.seed_everything(seed)
 
@@ -3,8 +3,8 @@
 
 import argparse
 import math
+from collections.abc import Callable
 from contextlib import contextmanager
-from typing import Callable
 from unittest.mock import patch
 
 import torch
@@ -1,155 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import itertools
-
-import torch
-
-from vllm import _custom_ops as vllm_ops
-from vllm.triton_utils import triton
-
-
-def polynorm_naive(
-    x: torch.Tensor,
-    weight: torch.Tensor,
-    bias: torch.Tensor,
-    eps: float = 1e-6,
-):
-    orig_shape = x.shape
-    x = x.view(-1, x.shape[-1])
-
-    def norm(x, eps: float):
-        return x / torch.sqrt(x.pow(2).mean(-1, keepdim=True) + eps)
-
-    x = x.float()
-    return (
-        (
-            weight[0] * norm(x**3, eps)
-            + weight[1] * norm(x**2, eps)
-            + weight[2] * norm(x, eps)
-            + bias
-        )
-        .to(weight.dtype)
-        .view(orig_shape)
-    )
-
-
-def polynorm_vllm(
-    x: torch.Tensor,
-    weight: torch.Tensor,
-    bias: torch.Tensor,
-    eps: float = 1e-6,
-):
-    orig_shape = x.shape
-    x = x.view(-1, x.shape[-1])
-
-    out = torch.empty_like(x)
-    vllm_ops.poly_norm(out, x, weight, bias, eps)
-    output = out
-
-    output = output.view(orig_shape)
-    return output
-
-
-def calculate_diff(batch_size, seq_len, hidden_dim):
-    dtype = torch.bfloat16
-    x = torch.randn(batch_size, seq_len, hidden_dim, dtype=dtype, device="cuda")
-    weight = torch.ones(3, dtype=dtype, device="cuda")
-    bias = torch.ones(1, dtype=dtype, device="cuda")
-
-    output_naive = polynorm_naive(x, weight, bias)
-    output_vllm = polynorm_vllm(x, weight, bias)
-
-    if torch.allclose(output_naive, output_vllm, atol=1e-2, rtol=1e-2):
-        print("✅ All implementations match")
-    else:
-        print("❌ Implementations differ")
-
-
-batch_size_range = [2**i for i in range(0, 7, 2)]
-seq_length_range = [2**i for i in range(6, 11, 1)]
-dim_range = [2048, 4096]
-configs = list(itertools.product(dim_range, batch_size_range, seq_length_range))
-
-
-def get_benchmark():
-    @triton.testing.perf_report(
-        triton.testing.Benchmark(
-            x_names=["dim", "batch_size", "seq_len"],
-            x_vals=[list(_) for _ in configs],
-            line_arg="provider",
-            line_vals=["naive", "vllm"],
-            line_names=["Naive", "vLLM"],
-            styles=[("blue", "-"), ("red", "-")],
-            ylabel="us",
-            plot_name="polynorm-perf",
-            args={},
-        )
-    )
-    def benchmark(dim, batch_size, seq_len, provider):
-        dtype = torch.bfloat16
-        hidden_dim = dim * 4
-
-        x = torch.randn(batch_size, seq_len, hidden_dim, dtype=dtype, device="cuda")
-        weight = torch.ones(3, dtype=dtype, device="cuda")
-        bias = torch.ones(1, dtype=dtype, device="cuda")
-
-        quantiles = [0.5, 0.2, 0.8]
-
-        if provider == "naive":
-            ms, min_ms, max_ms = triton.testing.do_bench(
-                lambda: polynorm_naive(x, weight, bias),
-                quantiles=quantiles,
-            )
-        else:
-            ms, min_ms, max_ms = triton.testing.do_bench(
-                lambda: polynorm_vllm(x, weight, bias),
-                quantiles=quantiles,
-            )
-
-        return 1000 * ms, 1000 * max_ms, 1000 * min_ms
-
-    return benchmark
-
-
-if __name__ == "__main__":
-    import argparse
-
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--batch-size",
-        type=int,
-        default=4,
-        help="Batch size",
-    )
-    parser.add_argument(
-        "--seq-len",
-        type=int,
-        default=128,
-        help="Sequence length",
-    )
-    parser.add_argument(
-        "--hidden-dim",
-        type=int,
-        default=8192,
-        help="Intermediate size of MLP",
-    )
-    parser.add_argument(
-        "--save-path",
-        type=str,
-        default="./configs/polnorm/",
-        help="Path to save polnorm benchmark results",
-    )
-
-    args = parser.parse_args()
-
-    # Run correctness test
-    calculate_diff(
-        batch_size=args.batch_size,
-        seq_len=args.seq_len,
-        hidden_dim=args.hidden_dim,
-    )
-
-    benchmark = get_benchmark()
-    # Run performance benchmark
-    benchmark.run(print_data=True, save_path=args.save_path)
@@ -7,7 +7,8 @@ import torch
 
 from vllm import _custom_ops as ops
 from vllm.platforms import current_platform
-from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser
+from vllm.utils import FlexibleArgumentParser
+from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
 
 
 @torch.inference_mode()
benchmarks/kernels/benchmark_reshape_and_cache.py (new file, 172 lines)
@@ -0,0 +1,172 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import random
+import time
+
+import torch
+from tabulate import tabulate
+
+from vllm import _custom_ops as ops
+from vllm.logger import init_logger
+from vllm.platforms import current_platform
+from vllm.utils import FlexibleArgumentParser
+from vllm.utils.torch_utils import (
+    STR_DTYPE_TO_TORCH_DTYPE,
+    create_kv_caches_with_random,
+)
+
+logger = init_logger(__name__)
+
+
+@torch.inference_mode()
+def run_benchmark(
+    num_tokens: int,
+    num_heads: int,
+    head_size: int,
+    block_size: int,
+    num_blocks: int,
+    dtype: torch.dtype,
+    kv_cache_dtype: str,
+    num_iters: int,
+    benchmark_mode: str,
+    device: str = "cuda",
+) -> float:
+    """Return latency (seconds) for given num_tokens."""
+
+    if kv_cache_dtype == "fp8" and head_size % 16:
+        raise ValueError("fp8 kv-cache requires head_size to be a multiple of 16.")
+
+    current_platform.seed_everything(42)
+    torch.set_default_device(device)
+
+    # create random key / value tensors [T, H, D].
+    key = torch.randn(num_tokens, num_heads, head_size, dtype=dtype, device=device)
+    value = torch.randn_like(key)
+
+    # prepare the slot mapping.
+    # each token is assigned a unique slot in the KV-cache.
+    num_slots = block_size * num_blocks
+    if num_tokens > num_slots:
+        raise ValueError("num_tokens cannot exceed the total number of cache slots")
+    slot_mapping_lst = random.sample(range(num_slots), num_tokens)
+    slot_mapping = torch.tensor(slot_mapping_lst, dtype=torch.long, device=device)
+
+    key_caches, value_caches = create_kv_caches_with_random(
+        num_blocks,
+        block_size,
+        1,  # num_layers
+        num_heads,
+        head_size,
+        kv_cache_dtype,
+        dtype,
+        device=device,
+    )
+    key_cache, value_cache = key_caches[0], value_caches[0]
+    # to free unused memory
+    del key_caches, value_caches
+
+    # compute per-kernel scaling factors for fp8 conversion (if used).
+    k_scale = (key.amax() / 64.0).to(torch.float32)
+    v_scale = (value.amax() / 64.0).to(torch.float32)
+
+    function_under_test = lambda: ops.reshape_and_cache(
+        key,  # noqa: F821
+        value,  # noqa: F821
+        key_cache,  # noqa: F821
+        value_cache,  # noqa: F821
+        slot_mapping,  # noqa: F821
+        kv_cache_dtype,
+        k_scale,
+        v_scale,
+    )
+
+    if benchmark_mode == "cudagraph":
+        g = torch.cuda.CUDAGraph()
+        with torch.cuda.graph(g):
+            function_under_test()
+        torch.cuda.synchronize()
+        function_under_test = lambda: g.replay()
+
+    def run_cuda_benchmark(n_iters: int) -> float:
+        nonlocal key, value, key_cache, value_cache, slot_mapping
+        torch.cuda.synchronize()
+        start = time.perf_counter()
+        for _ in range(n_iters):
+            function_under_test()
+            torch.cuda.synchronize()
+        end = time.perf_counter()
+        return (end - start) / n_iters
+
+    # warm-up
+    run_cuda_benchmark(3)
+
+    lat = run_cuda_benchmark(num_iters)
+
+    # free tensors to mitigate OOM when sweeping
+    del key, value, key_cache, value_cache, slot_mapping
+    torch.cuda.empty_cache()
+
+    return lat
+
+
+def main(args):
+    rows = []
+    for exp in range(1, 17):
+        n_tok = 2**exp
+        lat = run_benchmark(
+            num_tokens=n_tok,
+            num_heads=args.num_heads,
+            head_size=args.head_size,
+            block_size=args.block_size,
+            num_blocks=args.num_blocks,
+            dtype=STR_DTYPE_TO_TORCH_DTYPE[args.dtype],
+            kv_cache_dtype=args.kv_cache_dtype,
+            num_iters=args.iters,
+            benchmark_mode=args.mode,
+            device="cuda",
+        )
+        rows.append([n_tok, lat * 1e6])  # convert to microseconds
+
+    print(f"Benchmark results for implementation cuda (measuring with {args.mode}):")
+    print(tabulate(rows, headers=["num_tokens", "latency (µs)"], floatfmt=".3f"))
+
+
+if __name__ == "__main__":
+    parser = FlexibleArgumentParser()
+
+    parser.add_argument("--num-heads", type=int, default=128)
+    parser.add_argument(
+        "--head-size",
+        type=int,
+        choices=[64, 80, 96, 112, 120, 128, 192, 256],
+        default=128,
+    )
+    parser.add_argument("--block-size", type=int, choices=[16, 32], default=16)
+    parser.add_argument("--num-blocks", type=int, default=128 * 128)
+
+    parser.add_argument(
+        "--dtype",
+        type=str,
+        choices=["half", "bfloat16", "float"],
+        default="bfloat16",
+    )
+
+    parser.add_argument(
+        "--kv-cache-dtype",
+        type=str,
+        choices=["auto", "fp8"],
+        default="auto",
+    )
+
+    parser.add_argument("--iters", type=int, default=200)
+
+    parser.add_argument(
+        "--mode",
+        type=str,
+        choices=["cudagraph", "no_graph"],
+        default="cudagraph",
+    )
+
+    args = parser.parse_args()
+
+    main(args)
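Editor's note (not part of the diff): the new file's --mode cudagraph path captures the cache-write kernel into a CUDA graph once and then times graph replays instead of eager launches, which removes Python and launch overhead from the measurement. A minimal, generic sketch of that capture-and-replay timing pattern under the assumption of a CUDA-capable device; the elementwise workload here is a placeholder, not the benchmarked kernel:

import time

import torch


def time_with_cudagraph(fn, num_iters: int = 100) -> float:
    """Capture one call to fn into a CUDA graph, then time graph replays."""
    # Warm up eagerly so lazy initialization is not captured into the graph.
    fn()
    torch.cuda.synchronize()

    g = torch.cuda.CUDAGraph()
    with torch.cuda.graph(g):
        fn()
    torch.cuda.synchronize()

    start = time.perf_counter()
    for _ in range(num_iters):
        g.replay()
    torch.cuda.synchronize()
    return (time.perf_counter() - start) / num_iters


if __name__ == "__main__":
    x = torch.randn(1 << 20, device="cuda")
    # Placeholder workload standing in for the cache-write kernel under test.
    lat = time_with_cudagraph(lambda: torch.add(x, 1.0, out=x))
    print(f"{lat * 1e6:.3f} us per replay")
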
@@ -1,7 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from __future__ import annotations
-
 import random
 import time
 
@@ -9,11 +7,14 @@ import torch
 from tabulate import tabulate
 
 from vllm import _custom_ops as ops
+from vllm.attention.ops.triton_reshape_and_cache_flash import (
+    triton_reshape_and_cache_flash,
+)
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
-from vllm.utils import (
+from vllm.utils import FlexibleArgumentParser
+from vllm.utils.torch_utils import (
     STR_DTYPE_TO_TORCH_DTYPE,
-    FlexibleArgumentParser,
     create_kv_caches_with_random_flash,
 )
 
@@ -31,6 +32,8 @@ def run_benchmark(
     kv_cache_dtype: str,
     kv_cache_layout: str,
     num_iters: int,
+    implementation: str,
+    benchmark_mode: str,
     device: str = "cuda",
 ) -> float:
     """Return latency (seconds) for given num_tokens."""
@@ -38,6 +41,14 @@ def run_benchmark(
     if kv_cache_dtype == "fp8" and head_size % 16:
         raise ValueError("fp8 kv-cache requires head_size to be a multiple of 16.")
 
+    if implementation not in ("cuda", "triton"):
+        raise ValueError(
+            f"Unsupported implementation: {implementation}. "
+            "Only 'cuda' and 'triton' are supported."
+        )
+    if implementation == "triton" and kv_cache_layout == "HND":
+        return float("nan")  # Triton does not support HND layout yet.
+
     current_platform.seed_everything(42)
     torch.set_default_device(device)
 
@@ -65,26 +76,48 @@ def run_benchmark(
         cache_layout=kv_cache_layout,
     )
     key_cache, value_cache = key_caches[0], value_caches[0]
+    # to free unused memory
+    del key_caches, value_caches
 
     # compute per-kernel scaling factors for fp8 conversion (if used).
     k_scale = (key.amax() / 64.0).to(torch.float32)
     v_scale = (value.amax() / 64.0).to(torch.float32)
 
+    if implementation == "cuda":
+        function_under_test = lambda: ops.reshape_and_cache_flash(
+            key,  # noqa: F821
+            value,  # noqa: F821
+            key_cache,  # noqa: F821
+            value_cache,  # noqa: F821
+            slot_mapping,  # noqa: F821
+            kv_cache_dtype,
+            k_scale,
+            v_scale,
+        )
+    else:
+        function_under_test = lambda: triton_reshape_and_cache_flash(
+            key,  # noqa: F821
+            value,  # noqa: F821
+            key_cache,  # noqa: F821
+            value_cache,  # noqa: F821
+            slot_mapping,  # noqa: F821
+            kv_cache_dtype,
+            k_scale,
+            v_scale,
+        )
+    if benchmark_mode == "cudagraph":
+        g = torch.cuda.CUDAGraph()
+        with torch.cuda.graph(g):
+            function_under_test()
+        torch.cuda.synchronize()
+        function_under_test = lambda: g.replay()
+
     def run_cuda_benchmark(n_iters: int) -> float:
         nonlocal key, value, key_cache, value_cache, slot_mapping
         torch.cuda.synchronize()
         start = time.perf_counter()
         for _ in range(n_iters):
-            ops.reshape_and_cache_flash(
+            function_under_test()
-                key,
-                value,
-                key_cache,
-                value_cache,
-                slot_mapping,
-                kv_cache_dtype,
-                k_scale,
-                v_scale,
-            )
             torch.cuda.synchronize()
         end = time.perf_counter()
         return (end - start) / n_iters
@@ -116,10 +149,16 @@ def main(args):
                 kv_cache_dtype=args.kv_cache_dtype,
                 kv_cache_layout=layout,
                 num_iters=args.iters,
+                implementation=args.implementation,
+                benchmark_mode=args.mode,
                 device="cuda",
             )
             rows.append([n_tok, layout, f"{lat * 1e6:.3f}"])
 
+    print(
+        f"Benchmark results for implementation {args.implementation}"
+        f" (measuring with {args.mode}):"
+    )
     print(tabulate(rows, headers=["num_tokens", "layout", "latency (µs)"]))
 
 
@@ -151,6 +190,21 @@ if __name__ == "__main__":
     )
 
     parser.add_argument("--iters", type=int, default=100)
+
+    parser.add_argument(
+        "--implementation",
+        type=str,
+        choices=["cuda", "triton"],
+        default="cuda",
+    )
+
+    parser.add_argument(
+        "--mode",
+        type=str,
+        choices=["cudagraph", "no_graph"],
+        default="cudagraph",
+    )
+
     args = parser.parse_args()
 
     main(args)
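Editor's note (not part of the diff): the hunk above hides the CUDA and Triton cache-write kernels behind a single function_under_test callable, so the timing loop is identical for both backends. A minimal sketch of that dispatch pattern under the assumption that the argument tensors are prepared exactly as in the benchmark (key/value, caches, slot_mapping, fp8 scales); the helper name select_reshape_and_cache_flash is hypothetical:

from collections.abc import Callable

import torch

from vllm import _custom_ops as ops
from vllm.attention.ops.triton_reshape_and_cache_flash import (
    triton_reshape_and_cache_flash,
)


def select_reshape_and_cache_flash(
    implementation: str,
    key: torch.Tensor,
    value: torch.Tensor,
    key_cache: torch.Tensor,
    value_cache: torch.Tensor,
    slot_mapping: torch.Tensor,
    kv_cache_dtype: str,
    k_scale: torch.Tensor,
    v_scale: torch.Tensor,
) -> Callable[[], None]:
    # Both kernels share one call signature, so the benchmark only has to
    # pick a callable once and time it uniformly (eagerly or via CUDA graph).
    if implementation not in ("cuda", "triton"):
        raise ValueError(f"Unsupported implementation: {implementation}")
    kernel = (
        ops.reshape_and_cache_flash
        if implementation == "cuda"
        else triton_reshape_and_cache_flash
    )
    return lambda: kernel(
        key, value, key_cache, value_cache, slot_mapping,
        kv_cache_dtype, k_scale, v_scale,
    )
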
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import itertools
-from typing import Optional, Union

 import torch
 from flashinfer.norm import fused_add_rmsnorm, rmsnorm
@@ -21,8 +20,8 @@ class HuggingFaceRMSNorm(nn.Module):
     def forward(
         self,
         x: torch.Tensor,
-        residual: Optional[torch.Tensor] = None,
-    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
+        residual: torch.Tensor | None = None,
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
         orig_dtype = x.dtype
         x = x.to(torch.float32)
         if residual is not None:
@@ -41,7 +40,7 @@ class HuggingFaceRMSNorm(nn.Module):
 def rmsnorm_naive(
     x: torch.Tensor,
     weight: torch.Tensor,
-    residual: Optional[torch.Tensor] = None,
+    residual: torch.Tensor | None = None,
     eps: float = 1e-6,
 ):
     naive_norm = HuggingFaceRMSNorm(x.shape[-1], eps=eps)
@@ -65,7 +64,7 @@ def rmsnorm_naive(
 def rmsnorm_flashinfer(
     x: torch.Tensor,
     weight: torch.Tensor,
-    residual: Optional[torch.Tensor] = None,
+    residual: torch.Tensor | None = None,
     eps: float = 1e-6,
 ):
     orig_shape = x.shape
@@ -89,7 +88,7 @@ def rmsnorm_flashinfer(
 def rmsnorm_vllm(
     x: torch.Tensor,
     weight: torch.Tensor,
-    residual: Optional[torch.Tensor] = None,
+    residual: torch.Tensor | None = None,
     eps: float = 1e-6,
 ):
     orig_shape = x.shape
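
The typing hunks above (and the similar ones further down) migrate from typing.Optional/typing.Union to the built-in PEP 604 union syntax, which is valid on Python 3.10 and newer. A minimal sketch of the equivalent spellings; the function body below is illustrative only and is not taken from the benchmark:

import torch

# before: residual: Optional[torch.Tensor] = None
# before: ... -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]
def forward(
    x: torch.Tensor,
    residual: torch.Tensor | None = None,
) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]:
    # return a tuple only when a residual is supplied
    return x if residual is None else (x + residual, residual)
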
@@ -2,7 +2,6 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 from itertools import accumulate
-from typing import Optional

 import nvtx
 import torch
@@ -18,7 +17,7 @@ def benchmark_rope_kernels_multi_lora(
     seq_len: int,
     num_heads: int,
     head_size: int,
-    rotary_dim: Optional[int],
+    rotary_dim: int | None,
     dtype: torch.dtype,
     seed: int,
     device: str,
@@ -1,77 +1,720 @@
-#!/usr/bin/env python3
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-import time

+"""
+Comprehensive 3-way SiLU Benchmark Suite
+
+This benchmark compares three SiLU implementations:
+1. SiLU V2 (CUDA) - Optimized CUDA kernel implementation
+2. Triton Kernel - Triton-based implementation
+
+The suite generates detailed performance comparisons including:
+- Memory bandwidth utilization
+- Speedup ratios (baseline vs optimized implementations)
+- Performance across different expert configurations and token distributions
+"""
+
+from collections.abc import Callable
+
+import matplotlib.pyplot as plt
+import numpy as np
 import torch
+
 from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
-    silu_mul_fp8_quant_deep_gemm,
+    persistent_masked_m_silu_mul_quant,
 )
 from vllm.platforms import current_platform
+from vllm.triton_utils import tl, triton
+from vllm.utils.deep_gemm import is_deep_gemm_e8m0_used
+
+
-def benchmark(E, T, H, G=128, runs=50):
-    current_platform.seed_everything(42)
-    y = torch.randn((E, T, 2 * H), dtype=torch.bfloat16, device="cuda")
-    tokens_per_expert = torch.randint(
-        T // 2, T, size=(E,), dtype=torch.int32, device="cuda"
+@triton.jit
+def _silu_mul_fp8_quant_deep_gemm(
+    # Pointers ------------------------------------------------------------
+    input_ptr,  # 16-bit activations (E, T, 2*H)
+    y_q_ptr,  # fp8 quantized activations (E, T, H)
+    y_s_ptr,  # 16-bit scales (E, T, G)
+    counts_ptr,  # int32 num tokens per expert (E)
+    # Sizes ---------------------------------------------------------------
+    H: tl.constexpr,  # hidden dimension (per output)
+    GROUP_SIZE: tl.constexpr,  # elements per group (usually 128)
+    # Strides for input (elements) ---------------------------------------
+    stride_i_e,
+    stride_i_t,
+    stride_i_h,
+    # Strides for y_q (elements) -----------------------------------------
+    stride_yq_e,
+    stride_yq_t,
+    stride_yq_h,
+    # Strides for y_s (elements) -----------------------------------------
+    stride_ys_e,
+    stride_ys_t,
+    stride_ys_g,
+    # Stride for counts (elements)
+    stride_counts_e,
+    # Numeric params ------------------------------------------------------
+    eps: tl.constexpr,
+    fp8_min: tl.constexpr,
+    fp8_max: tl.constexpr,
+    use_ue8m0: tl.constexpr,
+    # Meta ---------------------------------------------------------------
+    BLOCK: tl.constexpr,
+    NUM_STAGES: tl.constexpr,
+):
+    G = H // GROUP_SIZE
+
+    # map program id -> (e, g)
+    pid = tl.program_id(0)
+    e = pid // G
+    g = pid % G
+
+    e = e.to(tl.int64)
+    g = g.to(tl.int64)
+
+    # number of valid tokens for this expert
+    n_tokens = tl.load(counts_ptr + e * stride_counts_e).to(tl.int64)
+
+    cols = tl.arange(0, BLOCK).to(tl.int64)
+    mask = cols < BLOCK
+
+    base_input_offset = e * stride_i_e + g * GROUP_SIZE * stride_i_h
+    base_gate_offset = base_input_offset + cols * stride_i_h
+    base_up_offset = base_input_offset + H * stride_i_h + cols * stride_i_h
+    base_yq_offset = e * stride_yq_e + g * GROUP_SIZE * stride_yq_h + cols * stride_yq_h
+    base_ys_offset = e * stride_ys_e + g * stride_ys_g
+
+    for t in tl.range(0, n_tokens, num_stages=NUM_STAGES):
+        gate = tl.load(
+            input_ptr + base_gate_offset + t * stride_i_t, mask=mask, other=0.0
+        ).to(tl.float32)
+        up = tl.load(input_ptr + base_up_offset + t * stride_i_t, mask=mask, other=0.0)
+
+        gate = gate * (1.0 / (1.0 + tl.exp(-gate)))
+        y = gate * up
+
+        y_s = tl.maximum(tl.max(tl.abs(y)), eps) / fp8_max
+        if use_ue8m0:
+            y_s = tl.exp2(tl.ceil(tl.log2(y_s)))
+
+        y_q = tl.clamp(y / y_s, fp8_min, fp8_max).to(y_q_ptr.dtype.element_ty)
+
+        tl.store(y_q_ptr + base_yq_offset + t * stride_yq_t, y_q, mask=mask)
+        tl.store(y_s_ptr + base_ys_offset + t * stride_ys_t, y_s)
+
+
+def silu_mul_fp8_quant_deep_gemm_triton(
+    y: torch.Tensor,  # (E, T, 2*H)
+    tokens_per_expert: torch.Tensor,  # (E,) number of valid tokens per expert
+    num_parallel_tokens,
+    group_size: int = 128,
+    eps: float = 1e-10,
+    expert_offsets: torch.Tensor = None,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    """Quantize silu(y[..., :H]) * y[..., H:] to FP8 with group per-token scales
+
+    y has shape (E, T, 2*H). The first half of the last dimension is
+    silu-activated, multiplied by the second half, then quantized into FP8.
+
+    Returns `(y_q, y_s)` where
+    * `y_q`: FP8 tensor, shape (E, T, H), same layout as y[..., :H]
+    * `y_s`: FP32 tensor, shape (E, T, H // group_size), strides (T*G, 1, T)
+    """
+    assert y.ndim == 3, "y must be (E, T, 2*H)"
+    E, T, H2 = y.shape
+    assert H2 % 2 == 0, "last dim of y must be even (2*H)"
+    H = H2 // 2
+    G = (H + group_size - 1) // group_size
+    assert H % group_size == 0, "H must be divisible by group_size"
+    assert tokens_per_expert.ndim == 1 and tokens_per_expert.shape[0] == E, (
+        "tokens_per_expert must be shape (E,)"
+    )
+    tokens_per_expert = tokens_per_expert.to(device=y.device, dtype=torch.int32)
+
+    # allocate outputs
+    fp8_dtype = torch.float8_e4m3fn
+    y_q = torch.empty((E, T, H), dtype=fp8_dtype, device=y.device)
+
+    # strides (elements)
+    stride_i_e, stride_i_t, stride_i_h = y.stride()
+    stride_yq_e, stride_yq_t, stride_yq_h = y_q.stride()
+
+    # desired scale strides (elements): (T*G, 1, T)
+    stride_ys_e = T * G
+    stride_ys_t = 1
+    stride_ys_g = T
+    y_s = torch.empty_strided(
+        (E, T, G),
+        (stride_ys_e, stride_ys_t, stride_ys_g),
+        dtype=torch.float32,
+        device=y.device,
     )
+
+    stride_cnt_e = tokens_per_expert.stride()[0]
+
+    # Static grid over experts and H-groups.
+    # A loop inside the kernel handles the token dim
+    grid = (E * G,)
+
+    f_info = torch.finfo(fp8_dtype)
+    fp8_max = f_info.max
+    fp8_min = f_info.min
+
+    _silu_mul_fp8_quant_deep_gemm[grid](
+        y,
+        y_q,
+        y_s,
+        tokens_per_expert,
+        H,
+        group_size,
+        stride_i_e,
+        stride_i_t,
+        stride_i_h,
+        stride_yq_e,
+        stride_yq_t,
+        stride_yq_h,
+        stride_ys_e,
+        stride_ys_t,
+        stride_ys_g,
+        stride_cnt_e,
+        eps,
+        fp8_min,
+        fp8_max,
+        is_deep_gemm_e8m0_used(),
+        BLOCK=group_size,
+        NUM_STAGES=4,
+        num_warps=1,
+    )
+
+    return y_q, y_s
+
+
+# Parse generation strategies
+strategies = ["random_imbalanced", "uniform", "max_t"]
+
+
+def benchmark(
+    kernel: Callable,
+    E: int,
+    T: int,
+    H: int,
+    total_tokens: int,
+    num_parallel_tokens: int = 64,
+    G: int = 128,
+    runs: int = 200,
+    num_warmups: int = 20,
+    gen_strategy: str = "default",
+    iterations_per_run: int = 20,
+):
+    def generate_data(seed_offset=0):
+        """Generate input data with given seed offset"""
+        current_platform.seed_everything(42 + seed_offset)
+        y = torch.rand((E, T, 2 * H), dtype=torch.bfloat16, device="cuda").contiguous()
+
+        if gen_strategy == "random_imbalanced":
+
+            def generate_expert_loads(n_e, total_tokens, ratio, device="cuda"):
+                mean = total_tokens // n_e
+                min_max = mean // ratio
+                e = torch.ones(size=(E,), dtype=torch.int64, device=device) * mean
+                e[0] = min_max
+                r = torch.rand(size=(E - 1,))
+                r /= r.sum()
+                r *= total_tokens - min_max
+                r = r.round().long()
+                e[1:] = r.to(device=device)
+                return e
+
+            tokens_per_expert = generate_expert_loads(E, total_tokens, 0.7, "cuda")
+        elif gen_strategy == "uniform":
+            r = torch.rand(size=(E,))
+            r /= r.sum()
+            r *= total_tokens
+            r = r.round().long()
+            tokens_per_expert = r
+        elif gen_strategy == "max_t":
+            tokens_per_expert = torch.empty(size=(E,), dtype=torch.int32, device="cuda")
+            tokens_per_expert.fill_(total_tokens / E)
+        elif gen_strategy == "first_t":
+            tokens_per_expert = torch.zeros(size=(E,), dtype=torch.int32, device="cuda")
+            tokens_per_expert[0] = min(T, total_tokens)
+        else:
+            raise ValueError(f"Unknown generation strategy: {gen_strategy}")
+        return y, tokens_per_expert
+
+    dataset_count = 4
+    # Pre-generate different input matrices for each iteration to avoid cache effects
+    data_sets = [generate_data(i) for i in range(dataset_count)]
+
     # Warmup
-    for _ in range(10):
-        silu_mul_fp8_quant_deep_gemm(y, tokens_per_expert, group_size=G)
+    y, tokens_per_expert = data_sets[0]
+    for _ in range(num_warmups):
+        kernel(
+            y, tokens_per_expert, num_parallel_tokens=num_parallel_tokens, group_size=G
+        )
     torch.cuda.synchronize()
+
+    start_event = torch.cuda.Event(enable_timing=True)
+    end_event = torch.cuda.Event(enable_timing=True)
+
     # Benchmark
-    torch.cuda.synchronize()
-    start = time.perf_counter()
+    latencies: list[float] = []
     for _ in range(runs):
-        silu_mul_fp8_quant_deep_gemm(y, tokens_per_expert, group_size=G)
         torch.cuda.synchronize()

-    avg_time = (time.perf_counter() - start) / runs * 1000
-    # Calculate actual work done (only count valid tokens)
+        start_event.record()
+        for i in range(iterations_per_run):
+            y, tokens_per_expert = data_sets[i % dataset_count]
+            kernel(
+                y,
+                tokens_per_expert,
+                num_parallel_tokens=num_parallel_tokens,
+                group_size=G,
+            )
+        end_event.record()
+        end_event.synchronize()
+
+        total_time_ms = start_event.elapsed_time(end_event)
+        per_iter_time_ms = total_time_ms / iterations_per_run
+        latencies.append(per_iter_time_ms)
+
+    # Use median instead of average for better outlier handling
+    median_time_ms = np.median(latencies)
+    median_time_s = median_time_ms / 1000
+
+    # Calculate actual work done (using first dataset for consistency)
+    _, tokens_per_expert = data_sets[0]
     actual_tokens = tokens_per_expert.sum().item()
     actual_elements = actual_tokens * H

     # GFLOPS: operations per element = exp + 3 muls + 1 div + quantization ops ≈ 8 ops
     ops_per_element = 8
     total_ops = actual_elements * ops_per_element
-    gflops = total_ops / (avg_time / 1000) / 1e9
+    gflops = total_ops / median_time_s / 1e9

     # Memory bandwidth: bfloat16 inputs (2 bytes), fp8 output (1 byte), scales (4 bytes)
     input_bytes = actual_tokens * 2 * H * 2  # 2*H bfloat16 inputs
     output_bytes = actual_tokens * H * 1  # H fp8 outputs
     scale_bytes = actual_tokens * (H // G) * 4  # scales in float32
     total_bytes = input_bytes + output_bytes + scale_bytes
-    memory_bw = total_bytes / (avg_time / 1000) / 1e9
+    memory_bw = total_bytes / median_time_s / 1e9

-    return avg_time, gflops, memory_bw
+    HOPPER_BANDWIDTH_TBPS = 3.35
+    return (
+        median_time_ms,
+        gflops,
+        memory_bw,
+        (memory_bw / (HOPPER_BANDWIDTH_TBPS * 1024)) * 100,
+    )
+
+
+def create_comparison_plot(
+    ratios, silu_v2_times, triton_times, config_labels, strategy_name, id
+):
+    fig, ax = plt.subplots(1, 1, figsize=(18, 6))
+
+    # Configure x-axis positions
+    x = np.arange(len(config_labels))
+    width = 0.25
+
+    # Execution Time plot (lower is better)
+    ax.bar(x, silu_v2_times, width, label="SiLU V2 (CUDA)", alpha=0.8, color="blue")
+    ax.bar(
+        x + width, triton_times, width, label="Triton Kernel", alpha=0.8, color="green"
+    )
+
+    # Add speedup labels over each bar trio
+    for i in range(len(x)):
+        triton_v2_speedup = ratios[i][1]  # triton/v2
+        max_height = max(silu_v2_times[i], triton_times[i])
+
+        # Triton/V2 speedup
+        ax.text(
+            x[i] + width / 2,
+            max_height + max_height * 0.02,
+            f"{triton_v2_speedup:.2f}x",
+            ha="center",
+            va="bottom",
+            fontweight="bold",
+            fontsize=8,
+        )
+
+    ax.set_xlabel("Configuration")
+    ax.set_ylabel("% Utilization")
+    ax.set_title(
+        f"Memory Bandwidth Utilization (%) - {strategy_name}\n(Higher is Better)"
+    )
+    ax.set_xticks(x)
+    ax.set_xticklabels(config_labels, rotation=45, ha="right")
+    ax.legend()
+    ax.grid(True, alpha=0.3)
+
+    plt.tight_layout()
+    return fig, ax
+
+
+def create_combined_plot(all_results):
+    num_strategies = len(all_results)
+    fig, axes = plt.subplots(num_strategies, 1, figsize=(22, 7 * num_strategies))
+
+    if num_strategies == 1:
+        axes = [axes]
+
+    for idx, (
+        strategy_name,
+        all_ratios,
+        all_silu_v2_results,
+        all_triton_results,
+        config_labels,
+        config_x_axis,
+    ) in enumerate(all_results):
+        ax = axes[idx]
+
+        # Flatten the nested results to get bandwidth percentages for plotting
+        silu_v2_bandwidths = []
+        triton_bandwidths = []
+        flat_ratios = []
+
+        for config_results in all_silu_v2_results:
+            for result in config_results:
+                silu_v2_bandwidths.append(result[3])  # bandwidth percentage
+
+        for config_results in all_triton_results:
+            for result in config_results:
+                triton_bandwidths.append(result[3])  # bandwidth percentage
+
+        for config_ratios in all_ratios:
+            for ratio in config_ratios:
+                flat_ratios.append(ratio)
+
+        # Configure x-axis positions
+        x = np.arange(len(config_labels))
+        width = 0.25
+
+        # Bandwidth utilization plot (higher is better)
+        ax.bar(
+            x,
+            silu_v2_bandwidths,
+            width,
+            label="SiLU V2 (CUDA)",
+            alpha=0.8,
+            color="blue",
+        )
+        ax.bar(
+            x + width,
+            triton_bandwidths,
+            width,
+            label="Triton Kernel",
+            alpha=0.8,
+            color="green",
+        )
+
+        # Add speedup labels over each bar trio
+        for i in range(len(x)):
+            triton_v2_speedup = flat_ratios[i]  # triton/v2
+            max_height = max(silu_v2_bandwidths[i], triton_bandwidths[i])
+
+            # Triton/V2 speedup
+            ax.text(
+                x[i] + width / 2,
+                max_height + max_height * 0.02,
+                f"{triton_v2_speedup:.2f}x",
+                ha="center",
+                va="bottom",
+                fontweight="bold",
+                fontsize=8,
+            )
+
+        ax.set_xlabel("Configuration")
+        ax.set_ylabel("% Utilization")
+        ax.set_title(
+            f"Memory Bandwidth Utilization (%) - {strategy_name}\n(Higher is Better)"
+        )
+        ax.set_xticks(x)
+        ax.set_xticklabels(config_labels, rotation=45, ha="right")
+        ax.legend()
+        ax.grid(True, alpha=0.3)
+
+    plt.tight_layout()
+    filename = "silu_benchmark_combined_3way.png"
+    plt.savefig(filename, dpi=300, bbox_inches="tight")
+    plt.show()
+
+    return filename
+
+
+outer_dim = 7168
 configs = [
-    (8, 32, 1024),
-    (16, 64, 2048),
-    (32, 128, 4096),
     # DeepSeekV3 Configs
-    (256, 16, 7168),
-    (256, 32, 7168),
-    (256, 64, 7168),
-    (256, 128, 7168),
-    (256, 256, 7168),
-    (256, 512, 7168),
+    # (1, 56, 7168),
+    (8, 1024, 7168),
+    # (32, 56, 7168),
+    # DeepSeekV3 Configs
+    (32, 1024, 7168),
+    # DeepSeekV3 Configs
     (256, 1024, 7168),
 ]

-print(f"GPU: {torch.cuda.get_device_name()}")
-print(f"{'Config':<20} {'Time(ms)':<10} {'GFLOPS':<10} {'GB/s':<10}")
-print("-" * 50)
+runs = 100
+num_warmups = 20

-for E, T, H in configs:
-    try:
-        time_ms, gflops, gbps = benchmark(E, T, H)
-        print(f"E={E:3d},T={T:4d},H={H:4d} {time_ms:8.3f} {gflops:8.1f} {gbps:8.1f}")
-    except Exception:
-        print(f"E={E:3d},T={T:4d},H={H:4d} FAILED")
+strategy_descriptions = {
+    "uniform": "Uniform Random",
+    "random_imbalanced": "Imbalanced Random",
+    "max_t": "Even Assignment",
+    "first_t": "experts[0] = T, experts[1:] = 0",
+}
+
+print(f"GPU: {torch.cuda.get_device_name()}")
+print(f"Testing strategies: {', '.join(strategies)}")
+print(f"Configurations: {len(configs)} configs")
+
+all_results = []
+
+# Run benchmarks for each strategy
+for id, strategy in enumerate(strategies):
+    print(f"\n{'=' * 60}")
+    print(f"Testing strategy: {strategy_descriptions[strategy]}")
+    print(f"{'=' * 60}")
+
+    # Collect benchmark data for all three algorithms
+    config_labels = []
+    config_x_axis = []
+    all_silu_v2_results = []
+    all_triton_results = []
+    all_ratios = []
+
+    for E, T, H in configs:
+        total_tokens_config = []
+        for i in [8, 16, 32, 64, 128, 256, 512]:
+            if i <= T:
+                total_tokens_config.append(i * E)
+        config_x_axis.append(total_tokens_config)
+
+        silu_v2_results = []
+        triton_results = []
+        ratios = []
+
+        for total_tokens in total_tokens_config:
+            config_label = f"E={E},T={T},H={H},TT={total_tokens}"
+            config_labels.append(config_label)
+
+            # SiLU V2 (CUDA kernel) results
+            time_ms_silu_v2, gflops, gbps, perc = benchmark(
+                persistent_masked_m_silu_mul_quant,
+                E,
+                T,
+                H,
+                total_tokens,
+                runs=runs,
+                num_warmups=num_warmups,
+                gen_strategy=strategy,
+            )
+            silu_v2_results.append((time_ms_silu_v2, gflops, gbps, perc))
+
+            # Triton kernel results
+            time_ms_triton, gflops, gbps, perc = benchmark(
+                silu_mul_fp8_quant_deep_gemm_triton,
+                E,
+                T,
+                H,
+                total_tokens,
+                runs=runs,
+                num_warmups=num_warmups,
+                gen_strategy=strategy,
+            )
+            triton_results.append((time_ms_triton, gflops, gbps, perc))
+
+            # Calculate speedup ratios (triton baseline / implementation)
+            triton_v2_ratio = time_ms_triton / time_ms_silu_v2
+            ratios.append(triton_v2_ratio)
+
+            print(
+                f"Completed: {config_label}:"
+                f" V2: {time_ms_silu_v2:.3f}ms,"
+                f" Triton: {time_ms_triton:.3f}ms"
+            )
+
+        all_silu_v2_results.append(silu_v2_results)
+        all_triton_results.append(triton_results)
+        all_ratios.append(ratios)
+
+    # Store results for combined plotting
+    all_results.append(
+        (
+            strategy_descriptions[strategy],
+            all_ratios,
+            all_silu_v2_results,
+            all_triton_results,
+            config_labels,
+            config_x_axis,
+        )
+    )
+
+    # Print summary table for this strategy
+    print(f"\nSummary Table - {strategy_descriptions[strategy]}:")
+    print(f" {'V2 Time(ms)':<12} {'Triton Time(ms)':<14} {'Triton/V2':<10}")
+    print("-" * 90)
+
+    for i, (E, T, H) in enumerate(configs):
+        # Get the first result for each config (simplifying for summary)
+        v2_time = silu_v2_results[i][0]
+        triton_time = triton_results[i][0]
+        triton_v2_speedup = triton_time / v2_time
+        config_label = f"E={E:3d},T={T:4d},H={H:4d}"
+        print(
+            f"{config_label:<20} {v2_time:8.5f} {triton_time:10.5f} "
+            f"{triton_v2_speedup:8.2f}x"
+        )
+
+
+def create_total_tokens_plot(all_results):
+    num_strategies = len(all_results)
+    num_configs = len(configs)
+
+    fig, axs = plt.subplots(
+        num_strategies, num_configs * 2, figsize=(32, 8 * num_strategies)
+    )
+
+    # Add main title to the entire figure
+    fig.suptitle(
+        "Performance Analysis: Speedup vs Bandwidth Utilization (SiLU V2, and Triton)",
+        fontsize=18,
+        fontweight="bold",
+        y=0.98,
+    )
+
+    # Handle single strategy case
+    if num_strategies == 1:
+        axs = axs.reshape(1, -1)
+
+    # Handle single config case
+    if num_configs == 1:
+        axs = axs.reshape(-1, 2)
+
+    for strategy_idx, result in enumerate(all_results):
+        (
+            strategy_name,
+            all_ratios,
+            all_silu_v2_results,
+            all_triton_results,
+            config_labels,
+            config_x_axis,
+        ) = result
+
+        for config_idx in range(num_configs):
+            # Speedup plot (left column)
+            ax_speedup = axs[strategy_idx, config_idx * 2]
+            # Bandwidth plot (right column)
+            ax_bandwidth = axs[strategy_idx, config_idx * 2 + 1]
+
+            E, T, H = configs[config_idx]
+            ratios = all_ratios[config_idx]
+            total_tokens_values = config_x_axis[config_idx]
+
+            # Extract speedup ratios
+            triton_v2_ratios = [ratio for ratio in ratios]
+
+            # Extract bandwidth percentages for all implementations
+            v2_bandwidth_percentages = [
+                result[3] for result in all_silu_v2_results[config_idx]
+            ]
+            triton_bandwidth_percentages = [
+                result[3] for result in all_triton_results[config_idx]
+            ]
+
+            # Plot speedup ratios vs total tokens (left plot)
+            ax_speedup.plot(
+                total_tokens_values,
+                triton_v2_ratios,
+                "go-",
+                linewidth=3,
+                markersize=8,
+                label="Triton/V2 Speedup",
+            )
+            ax_speedup.set_title(
+                f"{strategy_name}\nSpeedup vs Baseline (Triton)\nE={E}, T={T}, H={H}",
+                fontsize=12,
+                fontweight="bold",
+            )
+            ax_speedup.set_xlabel("Total Tokens", fontweight="bold", fontsize=11)
+            ax_speedup.set_ylabel("Speedup Ratio", fontweight="bold", fontsize=11)
+            ax_speedup.legend(prop={"weight": "bold"})
+            ax_speedup.grid(True, alpha=0.3)
+
+            # Plot bandwidth utilization (right plot)
+            ax_bandwidth.plot(
+                total_tokens_values,
+                v2_bandwidth_percentages,
+                "o-",
+                linewidth=3,
+                markersize=8,
+                label="SiLU V2",
+                color="blue",
+            )
+            ax_bandwidth.plot(
+                total_tokens_values,
+                triton_bandwidth_percentages,
+                "o-",
+                linewidth=3,
+                markersize=8,
+                label="Triton",
+                color="green",
+            )
+            ax_bandwidth.set_title(
+                f"{strategy_name}\nBandwidth Utilization (Hopper)\nE={E}, T={T}, H={H}",
+                fontsize=12,
+                fontweight="bold",
+            )
+            ax_bandwidth.set_xlabel("Total Tokens", fontweight="bold", fontsize=11)
+            ax_bandwidth.set_ylabel(
+                "% of Peak Bandwidth", fontweight="bold", fontsize=11
+            )
+            ax_bandwidth.legend(prop={"weight": "bold"})
+            ax_bandwidth.grid(True, alpha=0.3)
+
+            # Format x-axis labels for both plots
+            for ax in [ax_speedup, ax_bandwidth]:
+                ax.set_xticks(total_tokens_values)
+                ax.set_xticklabels(
+                    [
+                        f"{tt // 1000}K" if tt >= 1000 else str(tt)
+                        for tt in total_tokens_values
+                    ],
+                    fontweight="bold",
+                )
+                # Make tick labels bold
+                for label in ax.get_xticklabels() + ax.get_yticklabels():
+                    label.set_fontweight("bold")
+
+            # Add value labels on Triton/V2 speedup points
+            for x, y in zip(total_tokens_values, triton_v2_ratios):
+                ax_speedup.annotate(
+                    f"{y:.2f}x",
+                    (x, y),
+                    textcoords="offset points",
+                    xytext=(0, -15),
+                    ha="center",
+                    fontsize=9,
+                    fontweight="bold",
+                    bbox=dict(boxstyle="round,pad=0.2", facecolor="green", alpha=0.3),
+                )
+
+    plt.tight_layout()
+    plt.subplots_adjust(top=0.93)  # Make room for main title
+    filename = "silu_benchmark_total_tokens_3way.png"
+    plt.savefig(filename, dpi=300, bbox_inches="tight")
+    plt.show()
+
+    return filename
+
+
+# Create comprehensive 3-way comparison plots
+combined_plot_filename = create_combined_plot(all_results)
+total_tokens_plot_filename = create_total_tokens_plot(all_results)
+
+print(f"\n{'=' * 80}")
+print("3-Way Benchmark Suite Complete!")
+print(f"Generated combined comparison plot: {combined_plot_filename}")
+print(f"Generated total tokens analysis plot: {total_tokens_plot_filename}")
+print("Compared: SiLU V2 (CUDA), and Triton implementations")
+print(f"{'=' * 80}")
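
To make the docstring contract above concrete, here is a minimal pure-PyTorch reference for the operation both kernels implement: silu(y[..., :H]) * y[..., H:], quantized to FP8 with one scale per group of group_size channels. This sketch is illustrative only, it is not the CUDA or Triton kernel from this diff, and it ignores the per-expert token masking (tokens_per_expert):

import torch

def silu_mul_fp8_quant_reference(y: torch.Tensor, group_size: int = 128, eps: float = 1e-10):
    E, T, H2 = y.shape
    H = H2 // 2
    gate, up = y[..., :H].float(), y[..., H:].float()
    out = torch.nn.functional.silu(gate) * up                 # (E, T, H)
    groups = out.reshape(E, T, H // group_size, group_size)
    fmax = torch.finfo(torch.float8_e4m3fn).max
    y_s = groups.abs().amax(dim=-1).clamp(min=eps) / fmax     # (E, T, H // group_size)
    y_q = (groups / y_s[..., None]).clamp(-fmax, fmax).to(torch.float8_e4m3fn)
    return y_q.reshape(E, T, H), y_s

y = torch.randn(2, 4, 2 * 256, dtype=torch.bfloat16)
y_q, y_s = silu_mul_fp8_quant_reference(y)
print(y_q.shape, y_s.shape)  # torch.Size([2, 4, 256]) torch.Size([2, 4, 2])
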
@@ -4,7 +4,6 @@
 import csv
 import os
 from datetime import datetime
-from typing import Optional

 import flashinfer
 import torch
@@ -28,9 +27,7 @@ def to_float8(x, dtype=torch.float8_e4m3fn):
 @torch.no_grad()
 def benchmark_decode(
     dtype: torch.dtype,
-    quant_dtypes: tuple[
-        Optional[torch.dtype], Optional[torch.dtype], Optional[torch.dtype]
-    ],
+    quant_dtypes: tuple[torch.dtype | None, torch.dtype | None, torch.dtype | None],
     batch_size: int,
     max_seq_len: int,
     num_heads: tuple[int, int] = (64, 8),
@@ -4,7 +4,6 @@
 import csv
 import os
 from datetime import datetime
-from typing import Optional

 import flashinfer
 import torch
@@ -28,9 +27,7 @@ def to_float8(x, dtype=torch.float8_e4m3fn):
 @torch.no_grad()
 def benchmark_prefill(
     dtype: torch.dtype,
-    quant_dtypes: tuple[
-        Optional[torch.dtype], Optional[torch.dtype], Optional[torch.dtype]
-    ],
+    quant_dtypes: tuple[torch.dtype | None, torch.dtype | None, torch.dtype | None],
     batch_size: int,
     max_seq_len: int,
     num_heads: tuple[int, int] = (64, 8),
@@ -11,13 +11,13 @@ from datetime import datetime
 from typing import Any

 import torch
-import triton
 from tqdm import tqdm

 from vllm.model_executor.layers.quantization.utils.fp8_utils import (
-    _w8a8_block_fp8_matmul,
+    _w8a8_triton_block_scaled_mm,
 )
 from vllm.platforms import current_platform
+from vllm.triton_utils import triton
 from vllm.utils import FlexibleArgumentParser

 mp.set_start_method("spawn", force=True)
@@ -56,7 +56,7 @@ def w8a8_block_matmul(
         Bs: The per-block quantization scale for `B`.
         block_size: The block size for per-block quantization.
                     It should be 2-dim, e.g., [128, 128].
-        output_dytpe: The dtype of the returned tensor.
+        output_dtype: The dtype of the returned tensor.

     Returns:
         torch.Tensor: The result of matmul.
@@ -83,7 +83,7 @@ def w8a8_block_matmul(
         )

     if A.dtype == torch.float8_e4m3fn:
-        kernel = _w8a8_block_fp8_matmul
+        kernel = _w8a8_triton_block_scaled_mm
     else:
         raise RuntimeError("Currently, only support tune w8a8 block fp8 kernel.")

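
The [128, 128] block size in the docstring above refers to per-block weight quantization: each 128x128 tile of B gets one FP8 scale, so Bs holds one entry per tile. A self-contained sketch of that layout in plain PyTorch; this illustrates the data layout only, it is not the vLLM _w8a8_triton_block_scaled_mm kernel, and the absmax-based scaling below is an assumption:

import torch

def per_block_quant_fp8(w: torch.Tensor, block: int = 128):
    n, k = w.shape
    assert n % block == 0 and k % block == 0, "example keeps shapes block-aligned"
    fp8 = torch.float8_e4m3fn
    fmax = torch.finfo(fp8).max
    # view as (n/block, block, k/block, block) and take one absmax per tile
    tiles = w.reshape(n // block, block, k // block, block)
    scales = tiles.abs().amax(dim=(1, 3)).clamp(min=1e-12) / fmax   # (n/128, k/128)
    w_q = (tiles / scales[:, None, :, None]).clamp(-fmax, fmax).to(fp8)
    return w_q.reshape(n, k), scales

w = torch.randn(256, 512)
w_q, bs = per_block_quant_fp8(w)
print(w_q.shape, bs.shape)  # torch.Size([256, 512]) torch.Size([2, 4])
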
@@ -1,6 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-# fmt: off
 # ruff: noqa: E501
 import time

@@ -8,27 +7,33 @@ import torch

 from vllm import _custom_ops as ops
 from vllm.model_executor.layers.quantization.utils.fp8_utils import (
-    get_col_major_tma_aligned_tensor,
     per_token_group_quant_fp8,
-    w8a8_block_fp8_matmul,
+    w8a8_triton_block_scaled_mm,
 )
 from vllm.triton_utils import triton
-from vllm.utils.deep_gemm import calc_diff, fp8_gemm_nt, per_block_cast_to_fp8
+from vllm.utils.deep_gemm import (
+    calc_diff,
+    fp8_gemm_nt,
+    get_col_major_tma_aligned_tensor,
+    per_block_cast_to_fp8,
+)


-def benchmark_shape(m: int,
+def benchmark_shape(
+    m: int,
     n: int,
     k: int,
     warmup: int = 100,
     repeat: int = 10000,
-                    verbose: bool = False) -> dict:
+    verbose: bool = False,
+) -> dict:
     """Benchmark all implementations for a specific (m, n, k) shape."""
     if verbose:
         print(f"\n=== Benchmarking shape: m={m}, n={n}, k={k} ===")

     # Create test tensors
-    A = torch.randn((m, k), device='cuda', dtype=torch.bfloat16)
-    B = torch.randn((n, k), device='cuda', dtype=torch.bfloat16)
+    A = torch.randn((m, k), device="cuda", dtype=torch.bfloat16)
+    B = torch.randn((n, k), device="cuda", dtype=torch.bfloat16)

     # Reference result in BF16
     torch.cuda.synchronize()
@@ -45,34 +50,39 @@ def benchmark_shape(m: int,
     # Pre-quantize A for all implementations
     A_deepgemm, A_scale_deepgemm = per_token_group_quant_fp8(A, block_size[1])
     A_scale_deepgemm = get_col_major_tma_aligned_tensor(A_scale_deepgemm)
-    C_deepgemm = torch.empty((m, n), device='cuda', dtype=torch.bfloat16)
+    C_deepgemm = torch.empty((m, n), device="cuda", dtype=torch.bfloat16)
     A_vllm, A_scale_vllm = per_token_group_quant_fp8(A, block_size[1])
     A_vllm_cutlass, A_scale_vllm_cutlass = per_token_group_quant_fp8(
-        A, block_size[1], column_major_scales=True)
+        A, block_size[1], column_major_scales=True
+    )

     # === DeepGEMM Implementation ===
     def deepgemm_gemm():
-        fp8_gemm_nt((A_deepgemm, A_scale_deepgemm),
-                                       (B_deepgemm, B_scale_deepgemm),
-                                       C_deepgemm)
+        fp8_gemm_nt(
+            (A_deepgemm, A_scale_deepgemm), (B_deepgemm, B_scale_deepgemm), C_deepgemm
+        )
         return C_deepgemm

     # === vLLM Triton Implementation ===
     def vllm_triton_gemm():
-        return w8a8_block_fp8_matmul(A_vllm,
+        return w8a8_triton_block_scaled_mm(
+            A_vllm,
             B_vllm,
             A_scale_vllm,
             B_scale_vllm,
             block_size,
-                                     output_dtype=torch.bfloat16)
+            output_dtype=torch.bfloat16,
+        )

     # === vLLM CUTLASS Implementation ===
     def vllm_cutlass_gemm():
-        return ops.cutlass_scaled_mm(A_vllm_cutlass,
+        return ops.cutlass_scaled_mm(
+            A_vllm_cutlass,
             B_vllm.T,
             scale_a=A_scale_vllm_cutlass,
             scale_b=B_scale_vllm.T,
-                                     out_dtype=torch.bfloat16)
+            out_dtype=torch.bfloat16,
+        )

     # Run correctness check first
     if verbose:
@@ -89,26 +99,23 @@ def benchmark_shape(m: int,
         print(f"DeepGEMM vs Reference difference: {deepgemm_diff:.6f}")
         print(f"vLLM Triton vs Reference difference: {vllm_triton_diff:.6f}")
 | 
					        print(f"vLLM Triton vs Reference difference: {vllm_triton_diff:.6f}")
 | 
				
			||||||
        print(f"vLLM CUTLASS vs Reference difference: {vllm_cutlass_diff:.6f}")
 | 
					        print(f"vLLM CUTLASS vs Reference difference: {vllm_cutlass_diff:.6f}")
 | 
				
			||||||
        print("vLLM Triton vs DeepGEMM difference: "
 | 
					        print(
 | 
				
			||||||
              f"{calc_diff(C_vllm_triton, C_deepgemm):.6f}")
 | 
					            "vLLM Triton vs DeepGEMM difference: "
 | 
				
			||||||
        print("vLLM CUTLASS vs DeepGEMM difference: "
 | 
					            f"{calc_diff(C_vllm_triton, C_deepgemm):.6f}"
 | 
				
			||||||
              f"{calc_diff(C_vllm_cutlass, C_deepgemm):.6f}")
 | 
					        )
 | 
				
			||||||
 | 
					        print(
 | 
				
			||||||
 | 
					            "vLLM CUTLASS vs DeepGEMM difference: "
 | 
				
			||||||
 | 
					            f"{calc_diff(C_vllm_cutlass, C_deepgemm):.6f}"
 | 
				
			||||||
 | 
					        )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    # Benchmark implementations
 | 
					    # Benchmark implementations
 | 
				
			||||||
    implementations = {
 | 
					    implementations = {
 | 
				
			||||||
        "DeepGEMM": deepgemm_gemm,
 | 
					        "DeepGEMM": deepgemm_gemm,
 | 
				
			||||||
        "vLLM Triton": vllm_triton_gemm,
 | 
					        "vLLM Triton": vllm_triton_gemm,
 | 
				
			||||||
        "vLLM CUTLASS": vllm_cutlass_gemm
 | 
					        "vLLM CUTLASS": vllm_cutlass_gemm,
 | 
				
			||||||
    }
 | 
					    }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    benchmark_results = {
 | 
					    benchmark_results = {"shape": {"m": m, "n": n, "k": k}, "implementations": {}}
 | 
				
			||||||
        "shape": {
 | 
					 | 
				
			||||||
            "m": m,
 | 
					 | 
				
			||||||
            "n": n,
 | 
					 | 
				
			||||||
            "k": k
 | 
					 | 
				
			||||||
        },
 | 
					 | 
				
			||||||
        "implementations": {}
 | 
					 | 
				
			||||||
    }
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
    for name, func in implementations.items():
 | 
					    for name, func in implementations.items():
 | 
				
			||||||
        # Warmup
 | 
					        # Warmup
 | 
				
			||||||
@ -136,38 +143,36 @@ def benchmark_shape(m: int,
 | 
				
			|||||||
            "tflops": tflops,
 | 
					            "tflops": tflops,
 | 
				
			||||||
            "gb_s": gb_s,
 | 
					            "gb_s": gb_s,
 | 
				
			||||||
            "diff": {
 | 
					            "diff": {
 | 
				
			||||||
                "DeepGEMM":
 | 
					                "DeepGEMM": 0.0
 | 
				
			||||||
                0.0 if name == "DeepGEMM" else calc_diff(func(), C_deepgemm),
 | 
					                if name == "DeepGEMM"
 | 
				
			||||||
                "Reference":
 | 
					                else calc_diff(func(), C_deepgemm),
 | 
				
			||||||
                deepgemm_diff if name == "DeepGEMM" else
 | 
					                "Reference": deepgemm_diff
 | 
				
			||||||
                (vllm_triton_diff
 | 
					                if name == "DeepGEMM"
 | 
				
			||||||
                 if name == "vLLM Triton" else vllm_cutlass_diff)
 | 
					                else (vllm_triton_diff if name == "vLLM Triton" else vllm_cutlass_diff),
 | 
				
			||||||
            }
 | 
					            },
 | 
				
			||||||
        }
 | 
					        }
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        if verbose:
 | 
					        if verbose:
 | 
				
			||||||
            print(
 | 
					            print(f"{name}: {avg_time_ms:.3f} ms, {tflops:.2f} TFLOPS, {gb_s:.2f} GB/s")
 | 
				
			||||||
                f"{name}: {avg_time_ms:.3f} ms, {tflops:.2f} TFLOPS, {gb_s:.2f} GB/s"
 | 
					 | 
				
			||||||
            )
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
    # Calculate speedups
 | 
					    # Calculate speedups
 | 
				
			||||||
    baseline = benchmark_results["implementations"]["DeepGEMM"]["time_ms"]
 | 
					    baseline = benchmark_results["implementations"]["DeepGEMM"]["time_ms"]
 | 
				
			||||||
    for name, data in benchmark_results["implementations"].items():
 | 
					    for name, data in benchmark_results["implementations"].items():
 | 
				
			||||||
        if name != "DeepGEMM":
 | 
					        if name != "DeepGEMM":
 | 
				
			||||||
            speedup = baseline / data["time_ms"]
 | 
					            speedup = baseline / data["time_ms"]
 | 
				
			||||||
            benchmark_results["implementations"][name][
 | 
					            benchmark_results["implementations"][name]["speedup_vs_deepgemm"] = speedup
 | 
				
			||||||
                "speedup_vs_deepgemm"] = speedup
 | 
					 | 
				
			||||||
            if verbose:
 | 
					            if verbose:
 | 
				
			||||||
                print(f"DeepGEMM is {1/speedup:.2f}x "
 | 
					                print(
 | 
				
			||||||
                      f"{'faster' if 1/speedup > 1 else 'slower'} than {name}")
 | 
					                    f"DeepGEMM is {1 / speedup:.2f}x "
 | 
				
			||||||
 | 
					                    f"{'faster' if 1 / speedup > 1 else 'slower'} than {name}"
 | 
				
			||||||
 | 
					                )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    vllm_triton_time = benchmark_results["implementations"]["vLLM Triton"][
 | 
					    vllm_triton_time = benchmark_results["implementations"]["vLLM Triton"]["time_ms"]
 | 
				
			||||||
        "time_ms"]
 | 
					    vllm_cutlass_time = benchmark_results["implementations"]["vLLM CUTLASS"]["time_ms"]
 | 
				
			||||||
    vllm_cutlass_time = benchmark_results["implementations"]["vLLM CUTLASS"][
 | 
					 | 
				
			||||||
        "time_ms"]
 | 
					 | 
				
			||||||
    cutlass_vs_triton = vllm_triton_time / vllm_cutlass_time
 | 
					    cutlass_vs_triton = vllm_triton_time / vllm_cutlass_time
 | 
				
			||||||
    benchmark_results["implementations"]["vLLM CUTLASS"][
 | 
					    benchmark_results["implementations"]["vLLM CUTLASS"]["speedup_vs_triton"] = (
 | 
				
			||||||
        "speedup_vs_triton"] = cutlass_vs_triton
 | 
					        cutlass_vs_triton
 | 
				
			||||||
 | 
					    )
 | 
				
			||||||
    if verbose:
 | 
					    if verbose:
 | 
				
			||||||
        print(
 | 
					        print(
 | 
				
			||||||
            f"vLLM CUTLASS is {cutlass_vs_triton:.2f}x "
 | 
					            f"vLLM CUTLASS is {cutlass_vs_triton:.2f}x "
 | 
				
			||||||
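For reference, the TFLOPS and GB/s figures stored above are rate conversions from the measured time for an (m, n, k) GEMM. The exact byte accounting used by this script is outside the hunks shown here, so the sketch below is illustrative only; it assumes FP8 (1-byte) inputs and a BF16 (2-byte) output and ignores the scale tensors.

```python
# Illustrative only: typical TFLOPS / GB/s accounting for an (m, n, k) GEMM.
# The byte model (FP8 A and B, BF16 C, scales ignored) is an assumption; the
# benchmark's own formula is not visible in the hunks above.
def gemm_rates(m: int, n: int, k: int, time_ms: float) -> tuple[float, float]:
    flops = 2.0 * m * n * k  # one multiply and one add per (i, j, k) triple
    bytes_moved = m * k + n * k + 2 * m * n
    time_s = time_ms / 1e3
    return flops / time_s / 1e12, bytes_moved / time_s / 1e9


tflops, gb_s = gemm_rates(m=64, n=7168, k=2048, time_ms=0.05)
print(f"{tflops:.2f} TFLOPS, {gb_s:.2f} GB/s")
```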
@@ -179,8 +184,7 @@ def benchmark_shape(m: int,
 
 def format_table_row(values, widths):
     """Format a row with specified column widths."""
-    return "| " + " | ".join(f"{val:{w}}"
-                             for val, w in zip(values, widths)) + " |"
+    return "| " + " | ".join(f"{val:{w}}" for val, w in zip(values, widths)) + " |"
 
 
 def print_table(headers, rows, title=None):
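The collapsed one-liner behaves exactly like the two-line version it replaces; a quick usage check with made-up values:

```python
def format_table_row(values, widths):
    """Format a row with specified column widths."""
    return "| " + " | ".join(f"{val:{w}}" for val, w in zip(values, widths)) + " |"


# Each value is padded to its column width and joined with " | ".
print(format_table_row(["DeepGEMM", 64, 7168], [12, 6, 6]))
print(format_table_row(["vLLM Triton", 64, 7168], [12, 6, 6]))
```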
@@ -288,38 +292,50 @@ def run_benchmarks(verbose: bool = False):
     for result in all_results:
         shape = result["shape"]
         impl_data = result["implementations"]["DeepGEMM"]
-        deepgemm_rows.append([
-            shape["m"], shape["n"], shape["k"], f"{impl_data['time_us']:.1f}",
-            f"{impl_data['tflops']:.1f}", f"{impl_data['gb_s']:.1f}"
-        ])
+        deepgemm_rows.append(
+            [
+                shape["m"],
+                shape["n"],
+                shape["k"],
+                f"{impl_data['time_us']:.1f}",
+                f"{impl_data['tflops']:.1f}",
+                f"{impl_data['gb_s']:.1f}",
+            ]
+        )
 
-    print_table(deepgemm_headers,
-                deepgemm_rows,
-                title="DeepGEMM Implementation:")
+    print_table(deepgemm_headers, deepgemm_rows, title="DeepGEMM Implementation:")
 
     # Print vLLM Triton table
-    triton_headers = [
-        "m", "n", "k", "Time (μs)", "TFLOPS", "GB/s", "vs DeepGEMM"
-    ]
+    triton_headers = ["m", "n", "k", "Time (μs)", "TFLOPS", "GB/s", "vs DeepGEMM"]
     triton_rows = []
     for result in all_results:
         shape = result["shape"]
         impl_data = result["implementations"]["vLLM Triton"]
         speedup = impl_data.get("speedup_vs_deepgemm", 1.0)
-        triton_rows.append([
-            shape["m"], shape["n"], shape["k"], f"{impl_data['time_us']:.1f}",
-            f"{impl_data['tflops']:.1f}", f"{impl_data['gb_s']:.1f}",
-            format_speedup(speedup)
-        ])
+        triton_rows.append(
+            [
+                shape["m"],
+                shape["n"],
+                shape["k"],
+                f"{impl_data['time_us']:.1f}",
+                f"{impl_data['tflops']:.1f}",
+                f"{impl_data['gb_s']:.1f}",
+                format_speedup(speedup),
+            ]
+        )
 
-    print_table(triton_headers,
-                triton_rows,
-                title="vLLM Triton Implementation:")
+    print_table(triton_headers, triton_rows, title="vLLM Triton Implementation:")
 
     # Print vLLM CUTLASS table
     cutlass_headers = [
-        "m", "n", "k", "Time (μs)", "TFLOPS", "GB/s", "vs DeepGEMM",
-        "vs Triton"
+        "m",
+        "n",
+        "k",
+        "Time (μs)",
+        "TFLOPS",
+        "GB/s",
+        "vs DeepGEMM",
+        "vs Triton",
     ]
     cutlass_rows = []
     for result in all_results:
@@ -327,28 +343,27 @@ def run_benchmarks(verbose: bool = False):
         impl_data = result["implementations"]["vLLM CUTLASS"]
         vs_deepgemm = impl_data.get("speedup_vs_deepgemm", 1.0)
         vs_triton = impl_data.get("speedup_vs_triton", 1.0)
-        cutlass_rows.append([
-            shape["m"], shape["n"], shape["k"], f"{impl_data['time_us']:.1f}",
-            f"{impl_data['tflops']:.1f}", f"{impl_data['gb_s']:.1f}",
+        cutlass_rows.append(
+            [
+                shape["m"],
+                shape["n"],
+                shape["k"],
+                f"{impl_data['time_us']:.1f}",
+                f"{impl_data['tflops']:.1f}",
+                f"{impl_data['gb_s']:.1f}",
                 format_speedup(vs_deepgemm),
-            format_speedup(vs_triton)
-        ])
+                format_speedup(vs_triton),
+            ]
+        )
 
-    print_table(cutlass_headers,
-                cutlass_rows,
-                title="vLLM CUTLASS Implementation:")
+    print_table(cutlass_headers, cutlass_rows, title="vLLM CUTLASS Implementation:")
 
     # Calculate and print averages
     print("\n===== AVERAGE PERFORMANCE =====")
 
     implementations = ["DeepGEMM", "vLLM Triton", "vLLM CUTLASS"]
     avg_metrics = {
-        impl: {
-            "tflops": 0,
-            "gb_s": 0,
-            "time_ms": 0
-        }
-        for impl in implementations
+        impl: {"tflops": 0, "gb_s": 0, "time_ms": 0} for impl in implementations
     }
 
     for result in all_results:
@@ -366,9 +381,9 @@ def run_benchmarks(verbose: bool = False):
         avg_tflops = avg_metrics[impl]["tflops"] / num_shapes
         avg_mem_bw = avg_metrics[impl]["gb_s"] / num_shapes
         avg_time = avg_metrics[impl]["time_ms"] / num_shapes
-        avg_rows.append([
-            impl, f"{avg_tflops:.2f}", f"{avg_mem_bw:.2f}", f"{avg_time:.2f}"
-        ])
+        avg_rows.append(
+            [impl, f"{avg_tflops:.2f}", f"{avg_mem_bw:.2f}", f"{avg_time:.2f}"]
+        )
 
     print_table(avg_headers, avg_rows)
 
@@ -376,21 +391,19 @@ def run_benchmarks(verbose: bool = False):
     avg_speedups = {
         "DeepGEMM vs vLLM Triton": 0,
         "DeepGEMM vs vLLM CUTLASS": 0,
-        "vLLM CUTLASS vs vLLM Triton": 0
+        "vLLM CUTLASS vs vLLM Triton": 0,
     }
 
     for result in all_results:
         deepgemm_time = result["implementations"]["DeepGEMM"]["time_ms"]
         vllm_triton_time = result["implementations"]["vLLM Triton"]["time_ms"]
-        vllm_cutlass_time = result["implementations"]["vLLM CUTLASS"][
-            "time_ms"]
+        vllm_cutlass_time = result["implementations"]["vLLM CUTLASS"]["time_ms"]
 
-        avg_speedups[
-            "DeepGEMM vs vLLM Triton"] += vllm_triton_time / deepgemm_time
-        avg_speedups[
-            "DeepGEMM vs vLLM CUTLASS"] += vllm_cutlass_time / deepgemm_time
-        avg_speedups[
-            "vLLM CUTLASS vs vLLM Triton"] += vllm_triton_time / vllm_cutlass_time
+        avg_speedups["DeepGEMM vs vLLM Triton"] += vllm_triton_time / deepgemm_time
+        avg_speedups["DeepGEMM vs vLLM CUTLASS"] += vllm_cutlass_time / deepgemm_time
+        avg_speedups["vLLM CUTLASS vs vLLM Triton"] += (
+            vllm_triton_time / vllm_cutlass_time
+        )
 
     print("\n===== AVERAGE SPEEDUPS =====")
     speedup_headers = ["Comparison", "Speedup"]
@@ -408,8 +421,7 @@ def run_benchmarks(verbose: bool = False):
 
     for result in all_results:
         for impl in implementations:
-            avg_diff[impl] += result["implementations"][impl]["diff"][
-                "Reference"]
+            avg_diff[impl] += result["implementations"][impl]["diff"]["Reference"]
 
     diff_headers = ["Implementation", "Avg Diff vs Reference"]
     diff_rows = []
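To make the speedup bookkeeping above concrete (numbers are invented): `speedup_vs_deepgemm` divides the DeepGEMM time by the other implementation's time, so a value below 1.0 means DeepGEMM was faster, and the verbose message reports the reciprocal.

```python
# Invented timings, mirroring the formulas in the hunks above.
deepgemm_time_ms = 0.050
triton_time_ms = 0.065

speedup_vs_deepgemm = deepgemm_time_ms / triton_time_ms  # ~0.77 (< 1: Triton slower)
print(
    f"DeepGEMM is {1 / speedup_vs_deepgemm:.2f}x "
    f"{'faster' if 1 / speedup_vs_deepgemm > 1 else 'slower'} than vLLM Triton"
)
# DeepGEMM is 1.30x faster than vLLM Triton
```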
@@ -2,8 +2,8 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import dataclasses
-from collections.abc import Iterable
-from typing import Any, Callable, Optional
+from collections.abc import Callable, Iterable
+from typing import Any
 
 import torch
 import torch.utils.benchmark as TBenchmark
@@ -55,7 +55,7 @@ class Bench:
 
     def __init__(
         self,
-        cuda_graph_params: Optional[CudaGraphBenchParams],
+        cuda_graph_params: CudaGraphBenchParams | None,
         label: str,
         sub_label: str,
         description: str,
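Many of the hunks in this compare only modernize type hints: `typing.Optional[X]` and `typing.Union[...]` become PEP 604 unions (`X | None`, `A | B`), and `Callable` moves from `typing` to `collections.abc`. A minimal before/after sketch with a hypothetical function, not taken from the benchmark code:

```python
from collections.abc import Callable


# Before (Python 3.8-era typing):
#   from typing import Callable, Optional
#   def bench(fn: Callable[[], None], label: Optional[str] = None) -> None: ...
def bench(fn: Callable[[], None], label: str | None = None) -> None:
    # `str | None` is the PEP 604 spelling of Optional[str] (Python 3.10+).
    print(label or "unnamed")
    fn()


bench(lambda: None, "warmup")
```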
@@ -55,6 +55,107 @@ output_num_chunks  166.0    99.01   11.80    79.00    90.00    98.00   108.75
 ----------------------------------------------------------------------------------------------------
 ```
 
+### JSON configuration file for synthetic conversations generation
+
+The input flag `--input-file` is used to determine the input conversations for the benchmark.<br/>
+When the input is a JSON file with the field `"filetype": "generate_conversations"` the tool will generate synthetic multi-turn (questions and answers) conversations.
+
+The file `generate_multi_turn.json` is an example file.
+
+The file must contain the sections `prompt_input` and `prompt_output`.
+
+The `prompt_input` section must contain `num_turns`, `prefix_num_tokens` and `num_tokens`:
+
+* `num_turns` - Number of total turns in the conversation (both user & assistant).<br/>
+The final value will always be rounded to an even number so each user turn has a reply.
+* `prefix_num_tokens` - Tokens added at the start of only the **first user turn** in a conversation (unique per conversation).
+* `num_tokens` - Total token length of each **user** message (one turn).
+
+The `prompt_output` section must contain `num_tokens`:
+
+* `num_tokens` - Total token length of each **assistant** message (one turn).
+
+### Random distributions for synthetic conversations generation
+
+When creating an input JSON file (such as `generate_multi_turn.json`),<br/>
+every numeric field (such as `num_turns` or `num_tokens`) requires a distribution.<br/>
+The distribution determines how to randomly sample values for the field.
+
+The available distributions are listed below.
+
+**Note:** The optional `max` field (for lognormal, zipf, and poisson) can be used to cap sampled values at an upper bound.</br>
+Can be used to make sure that the total number of tokens in every request does not exceed `--max-model-len`.
+
+#### constant
+
+```json
+{
+    "distribution": "constant",
+    "value": 500
+}
+```
+
+* `value` - the fixed integer value (always returns the same number).
+
+#### uniform
+
+```json
+{
+    "distribution": "uniform",
+    "min": 12,
+    "max": 18
+}
+```
+
+* `min` - minimum value (inclusive).
+* `max` - maximum value (inclusive), should be equal or larger than min.
+
+#### lognormal
+
+```json
+{
+    "distribution": "lognormal",
+    "average": 1000,
+    "max": 5000
+}
+```
+
+You can parameterize the lognormal distribution in one of two ways:
+
+Using the average and optional median ratio:
+
+* `average` - target average value of the distribution.
+* `median_ratio` - the ratio of the median to the average; controls the skewness. Must be in the range (0, 1).
+
+Using the parameters of the underlying normal distribution:
+
+* `mean` - mean of the underlying normal distribution.
+* `sigma` - standard deviation of the underlying normal distribution.
+
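The two parameterizations are linked through the underlying normal distribution: the median of a lognormal is `exp(mu)` and its mean is `exp(mu + sigma^2 / 2)`. A small sketch of the conversion the generator performs from `average`/`median_ratio` to numpy's `mean`/`sigma` (0.85 is the default ratio used when the field is omitted; see the distribution changes later in this compare):

```python
import numpy as np

average = 1000
median_ratio = 0.85  # default when the field is omitted

median = average * median_ratio               # median(X) = exp(mu) -> mu = ln(median)
sigma = np.sqrt(2 * np.log(average / median))  # mean/median = exp(sigma^2 / 2)
mu = np.log(median)

print(f"mu={mu:.3f}, sigma={sigma:.3f}")  # roughly mu=6.745, sigma=0.570
```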
+#### zipf
+
+```json
+{
+    "distribution": "zipf",
+    "alpha": 1.2,
+    "max": 100
+}
+```
+
+* `alpha` - skew parameter (> 1). Larger values produce stronger skew toward smaller integers.
+
+#### poisson
+
+```json
+{
+    "distribution": "poisson",
+    "alpha": 10,
+    "max": 50
+}
+```
+
+* `alpha` - expected value (λ). Also the variance of the distribution.
+
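Putting the documented fields together, a minimal input file for the generator could look like the sketch below. Field values are illustrative and the file name is hypothetical; only the fields documented above are shown, so the real `generate_multi_turn.json` may differ.

```python
import json

config = {
    "filetype": "generate_conversations",
    "prompt_input": {
        "num_turns": {"distribution": "uniform", "min": 12, "max": 18},
        "prefix_num_tokens": {"distribution": "lognormal", "average": 1000, "max": 5000},
        "num_tokens": {"distribution": "uniform", "min": 120, "max": 160},
    },
    "prompt_output": {
        "num_tokens": {"distribution": "lognormal", "average": 1000, "max": 5000},
    },
}

with open("my_multi_turn_config.json", "w") as f:  # hypothetical file name
    json.dump(config, f, indent=4)
```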
 ## ShareGPT Conversations
 
 To run with the ShareGPT data, download the following ShareGPT dataset:
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from abc import ABC, abstractmethod
 from statistics import mean
-from typing import Any, NamedTuple, Optional, Union
+from typing import Any, NamedTuple
 
 import numpy as np  # type: ignore
 import pandas as pd  # type: ignore
@@ -35,8 +35,8 @@ class Distribution(ABC):
 class UniformDistribution(Distribution):
     def __init__(
         self,
-        min_val: Union[int, float],
-        max_val: Union[int, float],
+        min_val: int | float,
+        max_val: int | float,
         is_integer: bool = True,
     ) -> None:
         self.min_val = min_val
@@ -56,7 +56,7 @@ class UniformDistribution(Distribution):
 
 
 class ConstantDistribution(Distribution):
-    def __init__(self, value: Union[int, float]) -> None:
+    def __init__(self, value: int | float) -> None:
         self.value = value
         self.max_val = value
 
@@ -68,7 +68,7 @@ class ConstantDistribution(Distribution):
 
 
 class ZipfDistribution(Distribution):
-    def __init__(self, alpha: float, max_val: Optional[int] = None) -> None:
+    def __init__(self, alpha: float, max_val: int | None = None) -> None:
         self.alpha = alpha
         self.max_val = max_val
 
@@ -83,7 +83,7 @@ class ZipfDistribution(Distribution):
 
 
 class PoissonDistribution(Distribution):
-    def __init__(self, alpha: float, max_val: Optional[int] = None) -> None:
+    def __init__(self, alpha: float, max_val: int | None = None) -> None:
         self.alpha = alpha
         self.max_val = max_val
 
@@ -99,21 +99,105 @@ class PoissonDistribution(Distribution):
 
 class LognormalDistribution(Distribution):
     def __init__(
-        self, mean: float, sigma: float, max_val: Optional[int] = None
+        self,
+        mean: float | None = None,
+        sigma: float | None = None,
+        average: int | None = None,
+        median_ratio: float | None = None,
+        max_val: int | None = None,
     ) -> None:
+        self.average = average
+        self.median_ratio = median_ratio
+        self.max_val = max_val
+
+        if average is not None:
+            if average < 1:
+                raise ValueError("Lognormal average must be positive")
+
+            if mean or sigma:
+                raise ValueError(
+                    "When using lognormal average, you can't provide mean/sigma"
+                )
+
+            if self.median_ratio is None:
+                # Default value that provides relatively wide range of values
+                self.median_ratio = 0.85
+
+            # Calculate mean/sigma of np.random.lognormal based on the average
+            mean, sigma = self._generate_lognormal_by_median(
+                target_average=self.average, median_ratio=self.median_ratio
+            )
+        else:
+            if mean is None or sigma is None:
+                raise ValueError(
+                    "Must provide both mean and sigma if average is not used"
+                )
+
+            if mean <= 0 or sigma < 0:
+                raise ValueError(
+                    "Lognormal mean must be positive and sigma must be non-negative"
+                )
+
+        # Mean and standard deviation of the underlying normal distribution
+        # Based on numpy.random.lognormal
         self.mean = mean
         self.sigma = sigma
-        self.max_val = max_val
+
+    @staticmethod
+    def _generate_lognormal_by_median(
+        target_average: int, median_ratio: float
+    ) -> tuple[float, float]:
+        """
+        Compute (mu, sigma) for a lognormal distribution given:
+        - a target average (mean of the distribution)
+        - a ratio of median / mean (controls skewness), assume mean > median
+
+        Background:
+        If Z ~ Normal(mu, sigma^2), then X = exp(Z) ~ LogNormal(mu, sigma).
+        * mean(X)   = exp(mu + sigma^2 / 2)
+        * median(X) = exp(mu)
+
+        So:
+        median / mean = exp(mu) / exp(mu + sigma^2 / 2)
+                      = exp(-sigma^2 / 2)
+
+        Rearranging:
+        sigma^2 = 2 * ln(mean / median)
+        mu      = ln(median)
+
+        This gives a unique (mu, sigma) for any valid mean and median.
+        """
+        # Check input validity: median must be smaller than mean
+        if median_ratio <= 0 or median_ratio >= 1:
+            raise ValueError("median_ratio must be in range (0, 1)")
+
+        target_median = target_average * median_ratio
+
+        # Solve sigma^2 = 2 * ln(mean / median)
+        sigma = np.sqrt(2 * np.log(target_average / target_median))
+        mu = np.log(target_median)
+
+        return mu, sigma
 
     def sample(self, size: int = 1) -> np.ndarray:
         samples = np.random.lognormal(mean=self.mean, sigma=self.sigma, size=size)
+
+        if self.average is not None:
+            # Scale to average
+            samples *= self.average / samples.mean()
+
         if self.max_val:
             samples = np.minimum(samples, self.max_val)
 
         return np.round(samples).astype(int)
 
     def __repr__(self) -> str:
-        return f"LognormalDistribution[{self.mean}, {self.sigma}]"
+        if self.average:
+            return (
+                f"LognormalDistribution[{self.average}, "
+                f"{self.median_ratio}, {self.max_val}]"
+            )
+        return f"LognormalDistribution[{self.mean}, {self.sigma}, {self.max_val}]"
 
 
 class GenConvArgs(NamedTuple):
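A quick numeric check of the derivation in the docstring above (not part of the benchmark): derive `(mu, sigma)` from a target average, draw samples, apply the same rescale-and-cap steps as `sample()`, and confirm the empirical mean and median land near the targets.

```python
import numpy as np

average, median_ratio, max_val = 1000, 0.85, 5000

target_median = average * median_ratio
sigma = np.sqrt(2 * np.log(average / target_median))
mu = np.log(target_median)

samples = np.random.lognormal(mean=mu, sigma=sigma, size=200_000)
samples *= average / samples.mean()     # same rescale as sample()
samples = np.minimum(samples, max_val)  # same cap as sample()

print(round(samples.mean()), round(float(np.median(samples))))  # ~1000, ~850
```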
@@ -173,10 +257,21 @@ def get_random_distribution(
         return PoissonDistribution(conf["alpha"], max_val=max_val)
 
     elif distribution == "lognormal":
+        max_val = conf.get("max", None)
+
+        if "average" in conf:
+            # Infer lognormal mean/sigma (numpy) from input average
+            median_ratio = conf.get("median_ratio", None)
+            return LognormalDistribution(
+                average=conf["average"], median_ratio=median_ratio, max_val=max_val
+            )
+
+        # Use mean/sigma directly (for full control over the distribution)
         verify_field_exists(conf, "mean", section, subsection)
         verify_field_exists(conf, "sigma", section, subsection)
-        max_val = conf.get("max", None)
-        return LognormalDistribution(conf["mean"], conf["sigma"], max_val=max_val)
+        return LognormalDistribution(
+            mean=conf["mean"], sigma=conf["sigma"], max_val=max_val
+        )
 
     elif distribution == "uniform":
         verify_field_exists(conf, "min", section, subsection)
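For a self-contained picture of how a config block maps to samples, here is a hypothetical helper that mirrors the dispatch above but returns raw numpy arrays instead of the `Distribution` objects the benchmark actually builds; the uniform and zipf handling is a simplification and the helper is not part of the codebase.

```python
import numpy as np


def sample_from_conf(conf: dict, size: int = 1) -> np.ndarray:
    """Hypothetical stand-in for the config-to-distribution dispatch above."""
    kind = conf["distribution"]
    max_val = conf.get("max", None)
    if kind == "constant":
        samples = np.full(size, conf["value"], dtype=float)
    elif kind == "uniform":
        samples = np.random.uniform(conf["min"], conf["max"], size)
    elif kind == "zipf":
        samples = np.random.zipf(conf["alpha"], size).astype(float)
    elif kind == "poisson":
        samples = np.random.poisson(conf["alpha"], size).astype(float)
    elif kind == "lognormal" and "average" in conf:
        median = conf["average"] * conf.get("median_ratio", 0.85)
        sigma = np.sqrt(2 * np.log(conf["average"] / median))
        samples = np.random.lognormal(np.log(median), sigma, size)
        samples *= conf["average"] / samples.mean()
    elif kind == "lognormal":
        samples = np.random.lognormal(conf["mean"], conf["sigma"], size)
    else:
        raise ValueError(f"unknown distribution: {kind}")
    if max_val is not None:
        samples = np.minimum(samples, max_val)
    return np.round(samples).astype(int)


print(sample_from_conf({"distribution": "lognormal", "average": 1000, "max": 5000}, 8))
```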
@@ -13,7 +13,7 @@ from datetime import datetime
 from enum import Enum
 from http import HTTPStatus
 from statistics import mean
-from typing import NamedTuple, Optional, Union
+from typing import NamedTuple
 
 import aiohttp  # type: ignore
 import numpy as np  # type: ignore
@@ -46,9 +46,9 @@ class ConversationSampling(str, Enum):
 
 class ClientArgs(NamedTuple):
     seed: int
-    max_num_requests: Optional[int]
+    max_num_requests: int | None
     skip_first_turn: bool
-    max_turns: Optional[int]
+    max_turns: int | None
     max_active_conversations: int
     verbose: bool
     print_content: bool
@@ -109,9 +109,9 @@ class RequestStats(NamedTuple):
 
 class MetricStats:
     def __init__(self) -> None:
-        self.min: Optional[float] = None
-        self.max: Optional[float] = None
-        self.avg: Optional[float] = None
+        self.min: float | None = None
+        self.max: float | None = None
+        self.avg: float | None = None
         self.sum = 0.0
         self.count = 0
 
@@ -143,7 +143,7 @@ class MovingAverage:
         self.index = 0
         self.sum = 0.0
         self.count = 0
-        self.avg: Optional[float] = None
+        self.avg: float | None = None
 
     def update(self, new_value: float) -> None:
         if self.count < self.window_size:
@@ -169,7 +169,7 @@ class MovingAverage:
 class DebugStats:
     def __init__(self, logger: logging.Logger, window_size: int) -> None:
         self.logger = logger
-        self.metrics: dict[str, Union[MovingAverage, MetricStats]] = {
+        self.metrics: dict[str, MovingAverage | MetricStats] = {
             "moving_avg_ttft_ms": MovingAverage(window_size),
             "moving_avg_tpot_ms": MovingAverage(window_size),
             "ttft_ms": MetricStats(),
@@ -198,14 +198,6 @@ class DebugStats:
         self.logger.info("-" * 50)
 
 
-# Must support Python 3.8, we can't use str.removeprefix(prefix)
-# introduced in Python 3.9
-def remove_prefix(text: str, prefix: str) -> str:
-    if text.startswith(prefix):
-        return text[len(prefix) :]
-    return text
-
-
 def nanosec_to_millisec(value: float) -> float:
     return value / 1000000.0
 
@@ -220,8 +212,8 @@ async def send_request(
     chat_url: str,
     model: str,
     stream: bool = True,
-    min_tokens: Optional[int] = None,
-    max_tokens: Optional[int] = None,
+    min_tokens: int | None = None,
+    max_tokens: int | None = None,
 ) -> ServerResponse:
     payload = {
         "model": model,
@@ -250,9 +242,9 @@ async def send_request(
     timeout = aiohttp.ClientTimeout(total=timeout_sec)
 
     valid_response = True
-    ttft: Optional[float] = None
+    ttft: float | None = None
     chunk_delay: list[int] = []
-    latency: Optional[float] = None
+    latency: float | None = None
     first_chunk = ""
     generated_text = ""
 
@@ -269,7 +261,7 @@ async def send_request(
                 if not chunk_bytes:
                     continue
 
-                chunk = remove_prefix(chunk_bytes.decode("utf-8"), "data: ")
+                chunk = chunk_bytes.decode("utf-8").removeprefix("data: ")
                 if chunk == "[DONE]":
                     # End of stream
                     latency = time.perf_counter_ns() - start_time
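The helper removal above relies on `str.removeprefix`, available since Python 3.9, which only strips the prefix when it is actually present:

```python
# str.removeprefix (Python 3.9+) replaces the hand-rolled remove_prefix helper.
line = 'data: {"choices": []}'
print(line.removeprefix("data: "))      # '{"choices": []}'
print("[DONE]".removeprefix("data: "))  # unchanged: '[DONE]'
```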
@@ -364,7 +356,7 @@ async def send_turn(
     req_args: RequestArgs,
     verbose: bool,
     verify_output: bool,
-) -> Optional[RequestStats]:
+) -> RequestStats | None:
     assert messages_to_use > 0
     assert messages_to_use <= len(conversation_messages)
 
@@ -644,7 +636,7 @@ async def client_main(
 
             if args.verbose:
                 curr_time_sec: float = time.perf_counter()
-                time_since_last_turn: Union[str, float] = "N/A"
+                time_since_last_turn: str | float = "N/A"
                 if conv_id in time_of_last_turn:
                     time_since_last_turn = round(
                         curr_time_sec - time_of_last_turn[conv_id], 3
@@ -769,7 +761,7 @@ def get_client_config(
             "Number of conversations must be equal or larger than the number of clients"
         )
 
-    max_req_per_client: Optional[int] = None
+    max_req_per_client: int | None = None
     if args.max_num_requests is not None:
         # Max number of requests per client
         req_per_client = args.max_num_requests // args.num_clients
@@ -936,13 +928,13 @@ async def main_mp(
                     f"{num_clients_finished} out of {bench_args.num_clients} clients finished, collected {len(client_metrics)} measurements, runtime {runtime_sec:.3f} sec{Color.RESET}"  # noqa: E501
                 )
 
-                rps: Union[str, float] = round(len(client_metrics) / runtime_sec, 3)
+                rps: str | float = round(len(client_metrics) / runtime_sec, 3)
                 if len(client_metrics) < (5 * bench_args.num_clients):
                     # Do not estimate the RPS if the number of samples is very low
                     # (threshold can be tuned if needed)
                     rps = "N/A"
 
-                runtime_left_sec: Union[str, float] = round(
+                runtime_left_sec: str | float = round(
                     (runtime_sec / finished_convs) * (total_convs - finished_convs), 3
                 )
                 if percent < 0.05:
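The progress message above estimates the remaining runtime by linear extrapolation over finished conversations; a worked example with invented numbers:

```python
# Invented progress numbers, same formula as in the hunk above.
runtime_sec = 120.0
finished_convs = 40
total_convs = 100

runtime_left_sec = round(
    (runtime_sec / finished_convs) * (total_convs - finished_convs), 3
)
print(runtime_left_sec)  # 180.0 seconds still to go at the current rate
```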
@@ -1032,7 +1024,7 @@ def process_statistics(
     warmup_percentages: list[float],
     test_params: dict,
     verbose: bool,
-    gen_conv_args: Optional[GenConvArgs] = None,
+    gen_conv_args: GenConvArgs | None = None,
     excel_output: bool = False,
 ) -> None:
     if len(client_metrics) == 0:
@@ -1259,7 +1251,7 @@ async def main() -> None:
         default=None,
         help="The model name used in the API. "
         "If not specified, the model name will be the "
-        "same as the ``--model`` argument. ",
+        "same as the `--model` argument. ",
     )
 
     parser.add_argument(
@@ -13,7 +13,7 @@ import argparse
 import json
 import random
 from statistics import mean
-from typing import Any, Optional
+from typing import Any
 
 import pandas as pd  # type: ignore
 import tqdm  # type: ignore
@@ -25,7 +25,7 @@ def has_non_english_chars(text: str) -> bool:
 
 
 def content_is_valid(
-    content: str, min_content_len: Optional[int], max_content_len: Optional[int]
+    content: str, min_content_len: int | None, max_content_len: int | None
 ) -> bool:
     if min_content_len and len(content) < min_content_len:
         return False
@@ -37,7 +37,7 @@ def content_is_valid(
 
 
 def print_stats(
-    conversations: "list[dict[Any, Any]]", tokenizer: Optional[AutoTokenizer] = None
+    conversations: "list[dict[Any, Any]]", tokenizer: AutoTokenizer | None = None
 ) -> None:
     # Collect statistics
     stats = []
@@ -109,12 +109,12 @@ def convert_sharegpt_to_openai(
     seed: int,
     input_file: str,
     output_file: str,
-    max_items: Optional[int],
-    min_content_len: Optional[int] = None,
-    max_content_len: Optional[int] = None,
-    min_turns: Optional[int] = None,
-    max_turns: Optional[int] = None,
-    model: Optional[str] = None,
+    max_items: int | None,
+    min_content_len: int | None = None,
+    max_content_len: int | None = None,
+    min_turns: int | None = None,
+    max_turns: int | None = None,
+    model: str | None = None,
 ) -> None:
     if min_turns and max_turns:
         assert min_turns <= max_turns
@@ -15,9 +15,8 @@
         },
         "prefix_num_tokens": {
             "distribution": "lognormal",
-            "mean": 6,
-            "sigma": 4,
-            "max": 1500
+            "average": 1000,
+            "max": 5000
         },
         "num_tokens": {
             "distribution": "uniform",
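The config change above drops the lognormal's (mean, sigma) parameters in favor of an average prefix length and a larger cap; how the generator maps these fields onto a distribution lives in the benchmark script, which is not part of this hunk. A rough sketch of capped lognormal sampling, under the assumption that "average" is the target mean of the draw and "max" is a hard clamp:

    import math
    import random

    def sample_prefix_num_tokens(average: float = 1000.0, max_tokens: int = 5000, sigma: float = 1.0) -> int:
        # Choose mu so the lognormal mean exp(mu + sigma^2 / 2) equals `average`,
        # then clamp the draw at `max_tokens`. Names and semantics here are
        # assumptions for illustration, not the benchmark's actual implementation.
        mu = math.log(average) - sigma**2 / 2
        return min(int(random.lognormvariate(mu, sigma)), max_tokens)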
@@ -1,49 +0,0 @@
-# This local pyproject file is part of the migration from yapf to ruff format.
-# It uses the same core rules as the main pyproject.toml file, but with the
-# following differences:
-# - ruff line length is overridden to 88
-# - deprecated typing ignores (UP006, UP035) have been removed
-
-[tool.ruff]
-line-length = 88
-
-[tool.ruff.lint.per-file-ignores]
-"vllm/third_party/**" = ["ALL"]
-"vllm/version.py" = ["F401"]
-"vllm/_version.py" = ["ALL"]
-
-[tool.ruff.lint]
-select = [
-    # pycodestyle
-    "E",
-    # Pyflakes
-    "F",
-    # pyupgrade
-    "UP",
-    # flake8-bugbear
-    "B",
-    # flake8-simplify
-    "SIM",
-    # isort
-    "I",
-    # flake8-logging-format
-    "G",
-]
-ignore = [
-    # star imports
-    "F405", "F403",
-    # lambda expression assignment
-    "E731",
-    # Loop control variable not used within loop body
-    "B007",
-    # f-string format
-    "UP032",
-    # Can remove once 3.10+ is the minimum Python version
-    "UP007",
-]
-
-[tool.ruff.lint.isort]
-known-first-party = ["vllm"]
-
-[tool.ruff.format]
-docstring-code-format = true
Some files were not shown because too many files have changed in this diff.