mirror of
				https://github.com/vllm-project/vllm.git
				synced 2025-10-20 23:03:52 +08:00 
			
		
		
		
	Compare commits
	
		
			660 Commits
		
	
	
		
			v0.11.0rc4
			...
			woosuk/rm-
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
| 6f47333c4e | |||
| 83e760c57d | |||
| c2bba69065 | |||
| e133d6d218 | |||
| a1946c9f61 | |||
| 9f020f4f31 | |||
| 3b45075206 | |||
| 168e578efc | |||
| 6ac5e06f7c | |||
| 5c2acb270a | |||
| b26b70bec4 | |||
| ab4be40fc5 | |||
| 245e4f2c01 | |||
| 1d165d6d85 | |||
| 83004020fd | |||
| 12e21701e7 | |||
| 30a33b92ee | |||
| 7c572544e4 | |||
| c312320764 | |||
| c981f0ea78 | |||
| 6367bde739 | |||
| f50cc221ea | |||
| acedc74b1a | |||
| d29483b58a | |||
| 950cf9e58e | |||
| 3125d79950 | |||
| e33ee23ee3 | |||
| b10c64c834 | |||
| 0925b28a8e | |||
| 99722d5f0e | |||
| 4c91a28e30 | |||
| b038d9c40c | |||
| 2ba60ec7fe | |||
| bd7157a071 | |||
| be429d0cfd | |||
| c253745eb8 | |||
| daec4d2624 | |||
| 6c9fdbf725 | |||
| 483ea64611 | |||
| e20eba753b | |||
| bbc1b29665 | |||
| acb1bfa601 | |||
| 75c7ad9918 | |||
| 5550ff9c25 | |||
| 3aeb19a39e | |||
| 8c017b3490 | |||
| 9c2c2287a0 | |||
| fec2b341ad | |||
| 87bc0c492f | |||
| fe3b9372ad | |||
| bde9e2272a | |||
| 08405609cc | |||
| ab81379ea6 | |||
| 4ffd6e8942 | |||
| 965c5f4914 | |||
| 4d055ef465 | |||
| 17c540a993 | |||
| 4d4d6bad19 | |||
| 11ae016bd7 | |||
| 41d3071918 | |||
| fb5e10d3fb | |||
| b2f78cbad4 | |||
| 23583ee28c | |||
| 01c977e96d | |||
| b3dda72c23 | |||
| fb0571b077 | |||
| 2ed8b6b3d0 | |||
| 013abde6ef | |||
| a5464dcf92 | |||
| ac3ed5a815 | |||
| e6ba2000ae | |||
| aa255ff55a | |||
| 7bb736d00e | |||
| 9f4e30904b | |||
| 5afd3276df | |||
| 43721bc67f | |||
| 02d709a6f1 | |||
| 4a510ab487 | |||
| 314fa8abbf | |||
| 334535b6fb | |||
| dcbb3f1871 | |||
| 00417f4e44 | |||
| ed344f4116 | |||
| e51928793e | |||
| d2740fafbf | |||
| 17838e50ef | |||
| 44c8555621 | |||
| f7d318de2b | |||
| 76f0d05bc6 | |||
| 7d8975de84 | |||
| 785d8b6410 | |||
| f6cdc9a02f | |||
| 509cdc0370 | |||
| 9b6504c307 | |||
| e19b16dde6 | |||
| 582f2c6be7 | |||
| f8a0acbdbe | |||
| 1317034379 | |||
| 0ecc553ee6 | |||
| f96bc3649c | |||
| 938c43ea7f | |||
| 0a9ef0cfce | |||
| e5b438a247 | |||
| 0b99f5d302 | |||
| 1f491aa0c8 | |||
| de92d916fe | |||
| a1063628a4 | |||
| d796375258 | |||
| 14f8456344 | |||
| 4794c2bd92 | |||
| d3cbaa08dc | |||
| 828523ad8e | |||
| 136a17fe6e | |||
| f57438338d | |||
| 5d598680e3 | |||
| 8f4b313c37 | |||
| f93e348010 | |||
| f54f85129e | |||
| d4d1a6024f | |||
| db1764e4e0 | |||
| 7f83b4ee8e | |||
| 5c3bae1a6a | |||
| 5210dc3940 | |||
| 650b51f9f9 | |||
| 6256697997 | |||
| 71557a5f7c | |||
| f3c378ffa7 | |||
| f5ed68ef63 | |||
| efdef57b1f | |||
| b8a4572157 | |||
| 302ef403a2 | |||
| 8865da157b | |||
| f0862eae43 | |||
| 8c851f6d04 | |||
| 7cfa420f49 | |||
| a27b288e4a | |||
| e471d7ca7e | |||
| c43ca8259e | |||
| 85a65e7f51 | |||
| a2986b3e33 | |||
| 96b9aa5aa0 | |||
| e66d787bce | |||
| bfad142e25 | |||
| 9354660036 | |||
| 07ca70af8d | |||
| 2dcd12d357 | |||
| 579d2e5458 | |||
| 0512c04aee | |||
| 7e0ef4084a | |||
| 4aed506b65 | |||
| a86b4c58e8 | |||
| ff4810ba73 | |||
| 9d6964926e | |||
| 0e65818910 | |||
| 380f17527c | |||
| b92ab3deda | |||
| acaa2c0a4a | |||
| 82af928c41 | |||
| 87efc681db | |||
| c3a722fcb2 | |||
| aba48f7db1 | |||
| 04b5f9802d | |||
| efc8f7d814 | |||
| 6d87a2838c | |||
| e6cdbd6792 | |||
| df850c4912 | |||
| 720394de43 | |||
| 88a49745af | |||
| ca683a2a72 | |||
| e9f1b8c9e9 | |||
| ea97940d6c | |||
| fdd32750f0 | |||
| c715ba3735 | |||
| 9c4cb68339 | |||
| 780eb03d9b | |||
| ef9676a1f1 | |||
| 70b1b330e1 | |||
| d1d063a588 | |||
| 7e6edb1469 | |||
| 74704d4553 | |||
| d2f816d6ff | |||
| 577d498212 | |||
| fd85c9f426 | |||
| d32c611f45 | |||
| 01ad27faff | |||
| 481545b397 | |||
| d3cc8427c0 | |||
| 4821ac1b4d | |||
| 4497c8f821 | |||
| 2e36cdbe2b | |||
| fe3edb4cf0 | |||
| 29350922c6 | |||
| 8ae169286f | |||
| 8a0af6a561 | |||
| cfded80793 | |||
| b59dd19b55 | |||
| 3e051bda82 | |||
| 8317f72354 | |||
| d8bebb008a | |||
| 35bc22f23c | |||
| fa96fb9c70 | |||
| e3fdb627d9 | |||
| 7200a21cd1 | |||
| 577c72a227 | |||
| 314285d4f2 | |||
| d2a7938582 | |||
| 89342ce4c0 | |||
| f89f599395 | |||
| e251e457c5 | |||
| afc47e4de7 | |||
| e3b90c1ba2 | |||
| 134f70b3ed | |||
| a1b2d658ee | |||
| 5c7fe25491 | |||
| 53c9a7cee2 | |||
| 0d21b9b51e | |||
| 10214b6935 | |||
| 4a61950f4d | |||
| 3263799056 | |||
| 8e67b2557a | |||
| 4073c82c4e | |||
| 767c3ab869 | |||
| 4f207c7174 | |||
| 782505ed8e | |||
| 98f30b8cba | |||
| 3cd36660f7 | |||
| 46ad73955a | |||
| 41f3884438 | |||
| 60e419c1ee | |||
| 7ef6052804 | |||
| 4fca1a1bd2 | |||
| a6049be73c | |||
| 18ed7746ea | |||
| 8fcaaf6a16 | |||
| 9bb38130cb | |||
| b91d8db873 | |||
| 045b396d09 | |||
| 76852017ea | |||
| 82e64c7a20 | |||
| 4ca204055e | |||
| c5c8f5ea59 | |||
| 01653a917b | |||
| 0cd103e7cb | |||
| 5be7ca1b99 | |||
| f0a30a067b | |||
| 9d6cff3ede | |||
| a25f2adee9 | |||
| d0bed837ac | |||
| f7ee69868a | |||
| d2a71530c1 | |||
| 086609de64 | |||
| 727144bed1 | |||
| 55392bc879 | |||
| ddaff2938e | |||
| 27ed39a347 | |||
| 8f8474fbe3 | |||
| be067861c6 | |||
| 5bc26c438d | |||
| eef921f45e | |||
| e317414ce1 | |||
| 949cb0170d | |||
| e94cfd51da | |||
| 7c12763b24 | |||
| 3b780a4bbb | |||
| 30f78af147 | |||
| 19a9b169bf | |||
| 96ad65b7fe | |||
| 8d2b8c0ff2 | |||
| b2155ed317 | |||
| 910abdbd08 | |||
| cddce79fda | |||
| e519281920 | |||
| 7b03584de8 | |||
| ae9d0e7da5 | |||
| 0e67102d93 | |||
| f4ba2061cf | |||
| 1e6848a65d | |||
| 67661375fa | |||
| 213b64452a | |||
| 784c231151 | |||
| 606b00e80f | |||
| 720d3cd0f0 | |||
| ab196edefb | |||
| 3ee202ea1e | |||
| ad430a67ca | |||
| 6f0f570c43 | |||
| b545a0b207 | |||
| 29255cfc3b | |||
| da4455609d | |||
| aafb99a4d4 | |||
| 757fa4a4da | |||
| c6187f55f7 | |||
| 8983e0216f | |||
| 1ee35382cb | |||
| 6e783bc54b | |||
| c9d33c60dc | |||
| 2e54db4d2b | |||
| 44f633dba1 | |||
| a462331e36 | |||
| 4069db3f2e | |||
| 0d37450eb7 | |||
| 47e66c24e2 | |||
| 3b736e1c38 | |||
| 2c1c7dfb35 | |||
| e246ad6f0c | |||
| 5728da11ea | |||
| 92be3f3517 | |||
| d1ddf340c8 | |||
| ec10fd0abc | |||
| 0426e3c5e1 | |||
| 4bdf7ac593 | |||
| dc7976dd9f | |||
| e4791438ed | |||
| e6e898f95d | |||
| ddcbc2f334 | |||
| a83ff278d6 | |||
| cf4cd6c24f | |||
| b960441812 | |||
| 1317028aa8 | |||
| 5e49c3e777 | |||
| 0d7c3cb51d | |||
| 1b2c440cd6 | |||
| 0f29dca988 | |||
| d24cf322e1 | |||
| d17f0fbf30 | |||
| 43ab8cfaa5 | |||
| de253d63b7 | |||
| 8bd696fa53 | |||
| bb6d8c21f9 | |||
| ebf6ef1a9b | |||
| 0c52d6ef81 | |||
| 467a4f98f1 | |||
| e614ab7806 | |||
| 2a03f93de9 | |||
| da364615fc | |||
| f08919b7d1 | |||
| 93f2c0aa08 | |||
| 4ebc9108a7 | |||
| e1ba235668 | |||
| b82f4307c9 | |||
| 76879cc160 | |||
| b25d7b5657 | |||
| e09d1753ec | |||
| 4ba8875749 | |||
| 6273fe8d3d | |||
| 9fb3ae4e6f | |||
| 76afe4edf8 | |||
| c1b06fc182 | |||
| 241b4cfe66 | |||
| 9fc983c707 | |||
| 2f99f2f506 | |||
| 338b1bf04f | |||
| e39dc46f8f | |||
| 10c75b5439 | |||
| f9582fd8f4 | |||
| f377333bd7 | |||
| f8607863d8 | |||
| 335b28f7d1 | |||
| 5e65d6b2ad | |||
| 0d4f48fa10 | |||
| 127c8b782a | |||
| cd9890544b | |||
| 067da2d1df | |||
| 046118b938 | |||
| b32260ab85 | |||
| f80e7866c0 | |||
| 31a4b3e6c4 | |||
| caf8b1c084 | |||
| 1b86bd8e18 | |||
| 59012df99b | |||
| 3d1f67616d | |||
| 6ebaf43ee4 | |||
| 0c824fc46f | |||
| eb577e4655 | |||
| 8f36850f73 | |||
| 29fd2662ba | |||
| 30a3e5af69 | |||
| a38c1bfe09 | |||
| 320feae6f5 | |||
| 1e4ecca1d0 | |||
| c0a7b89d8e | |||
| 6f59beaf0b | |||
| 41f1cf38f2 | |||
| 08d26a1b7e | |||
| 63773a6200 | |||
| 883b42896a | |||
| e1098ced95 | |||
| d100d78eb3 | |||
| 7e4cd070b0 | |||
| 46b0779996 | |||
| de342585ff | |||
| 185d8ed44f | |||
| d9836d4517 | |||
| 5f7e8a916a | |||
| 4dbdf4a294 | |||
| c6873c4e6d | |||
| 2111b4643c | |||
| c50901f3b9 | |||
| 8229280a9c | |||
| f77df94647 | |||
| f231e5bc21 | |||
| 2161efe978 | |||
| f23b4c04fd | |||
| 93540958b8 | |||
| 44b9af5bb2 | |||
| 7cd95dc8a3 | |||
| c02058c222 | |||
| b2ea5ba677 | |||
| 824a3f403f | |||
| 05f6846ede | |||
| 20db99cc69 | |||
| 6431be808f | |||
| 4727a8afa7 | |||
| b8f603cebe | |||
| fc679696f8 | |||
| ab5e7d93f4 | |||
| 0340f45553 | |||
| 19a00eb210 | |||
| 391612e78b | |||
| 77c95f72f7 | |||
| 59f30d0448 | |||
| 43c146ca42 | |||
| 7c2ec0fe87 | |||
| 039b6bade3 | |||
| 6c04638214 | |||
| 91ac7f764d | |||
| 4be7d7c1c9 | |||
| 59b477645c | |||
| 778f554157 | |||
| d3c84297c3 | |||
| f509a20846 | |||
| 60bc25e74c | |||
| b893d661b1 | |||
| 6b6e98775f | |||
| 9c3c21c519 | |||
| 512b8affa4 | |||
| 1c0c68202c | |||
| 5f317530ec | |||
| 557b2e961d | |||
| 4e256cadc2 | |||
| d6953beb91 | |||
| 17edd8a807 | |||
| 3303cfb4ac | |||
| b7e8e4e6be | |||
| 432e1cbc23 | |||
| 201c971e96 | |||
| e0986ea07b | |||
| a964e5e6c3 | |||
| 78c1d5bfd2 | |||
| 59a85c366e | |||
| 119f00630b | |||
| a42d2df75f | |||
| 5c057e068f | |||
| ed3aeb25a4 | |||
| 86ee949128 | |||
| 4570535ec4 | |||
| 2a6dc67eb5 | |||
| f05fea1f5e | |||
| d0df145c2a | |||
| 1838cd4860 | |||
| 7d6b03381e | |||
| 7c2e91c4e0 | |||
| 736fbf4c89 | |||
| 44ea85137a | |||
| d3d649efec | |||
| ea507c3a93 | |||
| 9705fba7b7 | |||
| 2f7dbc9b42 | |||
| ea25a76c05 | |||
| 67bc0c003e | |||
| 5a05f26603 | |||
| 7ef40bb983 | |||
| 767cbb011d | |||
| 7cfa4b24bf | |||
| b71fcd4905 | |||
| 75003f34e8 | |||
| 78b8015a4d | |||
| 831b124151 | |||
| c1ffcb55da | |||
| 0879736aab | |||
| a26917332f | |||
| cd9e5b8340 | |||
| 300a59c4c3 | |||
| d76541a6c5 | |||
| dd96465fd7 | |||
| 4f8f47e87e | |||
| d78fda7cda | |||
| 73a99cc2a5 | |||
| adae0c1f43 | |||
| cbf9221992 | |||
| 5f42fc53b6 | |||
| 8ee846c27c | |||
| 812b7f54a8 | |||
| 5f2cacdb1e | |||
| aa5053e3fe | |||
| 79aa244678 | |||
| 2ed3f20dba | |||
| 48f309029a | |||
| 0e93ac0b3a | |||
| 5446ad1d24 | |||
| f9a8084e48 | |||
| 3e70e3d4d5 | |||
| eb0fa43868 | |||
| 0ad9951c41 | |||
| 8c9117181d | |||
| c4b48d3c0f | |||
| 10d765482d | |||
| 39b643dc1a | |||
| 711f485643 | |||
| 9c5ee91b2a | |||
| 27edd2aeb4 | |||
| e5017cd6d6 | |||
| 6a7796e871 | |||
| 47b9339546 | |||
| 5d5146eee3 | |||
| 2aaa423842 | |||
| ad2d788016 | |||
| 36ce76c632 | |||
| f1fc2107a3 | |||
| 13cdc02173 | |||
| 502640c3f9 | |||
| 3d5f1c8640 | |||
| 1cab2f9cad | |||
| 1e50f1be70 | |||
| ad87ba927a | |||
| decf7f794b | |||
| d00d652998 | |||
| 3b279a84be | |||
| 5e4a8223c6 | |||
| e51de388a2 | |||
| cc253b73d3 | |||
| 7d6fb905d9 | |||
| 418d111f8c | |||
| be8921fbba | |||
| d4e7a1152d | |||
| be22bb6f3d | |||
| 169313b9f8 | |||
| 0b018d8baf | |||
| c31246800c | |||
| 4134312b35 | |||
| da554f932e | |||
| aac622e0cd | |||
| 1726e93ef1 | |||
| ee04c0cd04 | |||
| c36f0aa300 | |||
| 5234dc7451 | |||
| 3b7c20a6b5 | |||
| f9e714813a | |||
| 2518230d3e | |||
| a332b84578 | |||
| 1405f0c7ba | |||
| 84d57342b6 | |||
| 57b46d769e | |||
| f48b6a03ba | |||
| 2a69ab4899 | |||
| 8d7da92fd7 | |||
| e952eee698 | |||
| 66bca9b8bd | |||
| 99028fda44 | |||
| 1244948885 | |||
| a73f6491c8 | |||
| 001e50c92c | |||
| 96ebcaa3ad | |||
| 5db1870bb9 | |||
| 2ce26b9b5d | |||
| a388252ac4 | |||
| 9a9f48dff7 | |||
| 67f3fb0844 | |||
| 43b752c325 | |||
| cfd302db9b | |||
| fb610ae684 | |||
| 2f652e6cdf | |||
| e6a226efba | |||
| a2e6fa7e03 | |||
| 9f1c4ecaf2 | |||
| ef283548f7 | |||
| f4db5e6de1 | |||
| 099aaee536 | |||
| 35fe398c7c | |||
| bb6d43047e | |||
| bc546f76a1 | |||
| 80608ba5af | |||
| e184c9c510 | |||
| d7e34b4210 | |||
| ef6e0e7132 | |||
| 1ad3aca682 | |||
| 8d0afa9b42 | |||
| fa7e254a7f | |||
| e23cacda35 | |||
| 2e1b8bc2b6 | |||
| e47433b3c1 | |||
| 23194d83e8 | |||
| 61aedb5ffe | |||
| d3bd171123 | |||
| 89e4050af4 | |||
| 78a47f87ce | |||
| 6a113d9aed | |||
| 2e4fe48c37 | |||
| 8eb0a1d906 | |||
| fea3e476aa | |||
| 61a3431613 | |||
| 9bedac9623 | |||
| c42ff4f4fd | |||
| d5ab28511c | |||
| e61eb5e09d | |||
| 0899ba5b42 | |||
| 145ac73317 | |||
| d0d138bc55 | |||
| 43227236ec | |||
| 8616300ae2 | |||
| edbaadd91f | |||
| 9360d34fa1 | |||
| 1b67b04656 | |||
| bd51f78e39 | |||
| 65ecb4f134 | |||
| 143844fa43 | |||
| 219cfbe7f6 | |||
| 9b44a7d926 | |||
| a3ae45a38c | |||
| 0307428d65 | |||
| 471997adf6 | |||
| b1ded114b9 | |||
| f4e4088c99 | |||
| 0efd540dbc | |||
| 6144754014 | |||
| 69311446ba | |||
| da63274d9f | |||
| c216119d64 | |||
| 5546acb463 | |||
| c0ec81836f | |||
| b65e56babe | |||
| 49996cd597 | |||
| ecb37e276a | |||
| a5354b3ed2 | |||
| f9df8b4ad7 | |||
| ec152c8748 | |||
| 7977e5027c | |||
| 3f5d902d2a | |||
| 27d7638b94 | |||
| 176173989a | |||
| 23b8ee672d | |||
| 3939152069 | |||
| cd87bfbf37 | |||
| b3613e3ace | |||
| d346ec695e | |||
| c242c98031 | |||
| f1d53d150c | |||
| 92da847cf5 | |||
| 3958b96bf5 | |||
| 8bf8f45822 | |||
| 6f5c0931c1 | |||
| 4e33a7ea85 | |||
| dc48ba0c75 | |||
| 4778b42660 | |||
| c70ac4b8ff | |||
| cf89202855 | |||
| f075693da7 | |||
| f708bd4904 | |||
| 0002b7f0d1 | |||
| 11aafd9886 | 
| @ -5,11 +5,11 @@ import os | ||||
| import sys | ||||
| import zipfile | ||||
|  | ||||
| # Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 450 MiB | ||||
| # Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 500 MiB | ||||
| # Note that we have 800 MiB quota, please use it wisely. | ||||
| # See https://github.com/pypi/support/issues/6326 . | ||||
| # Please also sync the value with the one in Dockerfile. | ||||
| VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 450)) | ||||
| VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 500)) | ||||
|  | ||||
|  | ||||
| def print_top_10_largest_files(zip_file): | ||||
|  | ||||
							
								
								
									
										12
									
								
								.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										12
									
								
								.buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-QQQ.yaml
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,12 @@ | ||||
| # For vllm script, with -t option (tensor parallel size). | ||||
| # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m HandH1998/QQQ-Llama-3-8b-g128 -b 32 -l 1000 -f 5 -t 1 | ||||
| model_name: "HandH1998/QQQ-Llama-3-8b-g128" | ||||
| tasks: | ||||
| - name: "gsm8k" | ||||
|   metrics: | ||||
|   - name: "exact_match,strict-match" | ||||
|     value: 0.419 | ||||
|   - name: "exact_match,flexible-extract" | ||||
|     value: 0.416 | ||||
| limit: 1000 | ||||
| num_fewshot: 5 | ||||
| @ -0,0 +1,12 @@ | ||||
| # For hf script, without -t option (tensor parallel size). | ||||
| # bash .buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -l 100 -t 8 | ||||
| model_name: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8" | ||||
| backend: "vllm-vlm" | ||||
| tasks: | ||||
| - name: "chartqa" | ||||
|   metrics: | ||||
|   - name: "relaxed_accuracy,none" | ||||
|     # TODO(zhewenl): model card is 0.90, but the actual score is 0.80. | ||||
|     value: 0.80 | ||||
| limit: 100 | ||||
| num_fewshot: 0 | ||||
| @ -0,0 +1,10 @@ | ||||
| # For hf script, without -t option (tensor parallel size). | ||||
| # bash .buildkite/lm-eval-harness/run-lm-eval-mmlupro-vllm-baseline.sh -m meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8 -l 250 -t 8 -f 5 | ||||
| model_name: "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8" | ||||
| tasks: | ||||
| - name: "mmlu_pro" | ||||
|   metrics: | ||||
|   - name: "exact_match,custom-extract" | ||||
|     value: 0.80 | ||||
| limit: 250 # will run on 250 * 14 subjects = 3500 samples | ||||
| num_fewshot: 5 | ||||
| @ -1,4 +1,5 @@ | ||||
| # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -b auto -l 1319 -f 5 -t 1 | ||||
| # For vllm script, with -t option (tensor parallel size) | ||||
| # bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic -l 1319 -t 1 | ||||
| model_name: "RedHatAI/Qwen2.5-VL-3B-Instruct-FP8-Dynamic" | ||||
| tasks: | ||||
| - name: "gsm8k" | ||||
|  | ||||
| @ -0,0 +1,12 @@ | ||||
| # For vllm script, with -t option (tensor parallel size). | ||||
| # bash .buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh -m Qwen/Qwen2.5-VL-7B-Instruct -l 2500 -t 1 | ||||
|  | ||||
| model_name: "Qwen/Qwen2.5-VL-7B-Instruct" | ||||
| backend: "vllm-vlm" | ||||
| tasks: | ||||
| - name: "chartqa" | ||||
|   metrics: | ||||
|   - name: "relaxed_accuracy,none" | ||||
|     value: 0.855 | ||||
| limit: 2500 | ||||
| num_fewshot: 0 | ||||
							
								
								
									
										1
									
								
								.buildkite/lm-eval-harness/configs/models-large-h100.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										1
									
								
								.buildkite/lm-eval-harness/configs/models-large-h100.txt
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1 @@ | ||||
| Meta-Llama-4-Maverick-17B-128E-Instruct-FP8.yaml | ||||
| @ -0,0 +1 @@ | ||||
| Meta-Llama-4-Maverick-17B-128E-Instruct-FP8-MM.yaml | ||||
							
								
								
									
										1
									
								
								.buildkite/lm-eval-harness/configs/models-mm-small.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										1
									
								
								.buildkite/lm-eval-harness/configs/models-mm-small.txt
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1 @@ | ||||
| Qwen2.5-VL-7B-Instruct.yaml | ||||
							
								
								
									
										44
									
								
								.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh
									
									
									
									
									
										Executable file
									
								
							
							
						
						
									
										44
									
								
								.buildkite/lm-eval-harness/run-lm-eval-chartqa-vllm-vlm-baseline.sh
									
									
									
									
									
										Executable file
									
								
							| @ -0,0 +1,44 @@ | ||||
| #!/bin/bash | ||||
| # We can use this script to compute baseline accuracy on chartqa for vllm. | ||||
| # | ||||
| # Make sure you have lm-eval-harness installed: | ||||
| #   pip install lm-eval==0.4.9 | ||||
|  | ||||
| usage() { | ||||
|     echo`` | ||||
|     echo "Runs lm eval harness on ChartQA using multimodal vllm." | ||||
|     echo "This pathway is intended to be used to create baselines for " | ||||
|     echo "our correctness tests in vllm's CI." | ||||
|     echo | ||||
|     echo "usage: ${0} <options>" | ||||
|     echo | ||||
|     echo "  -m    - huggingface stub or local directory of the model" | ||||
|     echo "  -l    - limit number of samples to run" | ||||
|     echo "  -t    - tensor parallel size to run at" | ||||
|     echo | ||||
| } | ||||
|  | ||||
| while getopts "m:l:t:" OPT; do | ||||
|   case ${OPT} in | ||||
|     m )  | ||||
|         MODEL="$OPTARG" | ||||
|         ;; | ||||
|     l )  | ||||
|         LIMIT="$OPTARG" | ||||
|         ;; | ||||
|     t )  | ||||
|         TP_SIZE="$OPTARG" | ||||
|         ;; | ||||
|     \? )  | ||||
|         usage | ||||
|         exit 1 | ||||
|         ;; | ||||
|   esac | ||||
| done | ||||
|  | ||||
| lm_eval --model vllm-vlm \ | ||||
|   --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE" \ | ||||
|   --tasks chartqa \ | ||||
|   --batch_size auto \ | ||||
|   --apply_chat_template \ | ||||
|   --limit $LIMIT | ||||
							
								
								
									
										0
									
								
								.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
									
									
									
									
									
										
										
										Normal file → Executable file
									
								
							
							
						
						
									
										0
									
								
								.buildkite/lm-eval-harness/run-lm-eval-gsm-hf-baseline.sh
									
									
									
									
									
										
										
										Normal file → Executable file
									
								
							| @ -0,0 +1,50 @@ | ||||
| #!/bin/bash | ||||
| # We can use this script to compute baseline accuracy on MMLUPRO for vllm. | ||||
| # We use this for fp8, which HF does not support. | ||||
| # | ||||
| # Make sure you have lm-eval-harness installed: | ||||
| #   pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d#egg=lm-eval[api] | ||||
|  | ||||
| usage() { | ||||
|     echo`` | ||||
|     echo "Runs lm eval harness on MMLU Pro using huggingface transformers." | ||||
|     echo "This pathway is intended to be used to create baselines for " | ||||
|     echo "our automated nm-test-accuracy workflow" | ||||
|     echo | ||||
|     echo "usage: ${0} <options>" | ||||
|     echo | ||||
|     echo "  -m    - huggingface stub or local directory of the model" | ||||
|     echo "  -l    - limit number of samples to run" | ||||
|     echo "  -f    - number of fewshot samples to use" | ||||
|     echo "  -t    - tensor parallel size to run at" | ||||
|     echo | ||||
| } | ||||
|  | ||||
| while getopts "m:b:l:f:t:" OPT; do | ||||
|   case ${OPT} in | ||||
|     m ) | ||||
|         MODEL="$OPTARG" | ||||
|         ;; | ||||
|     b ) | ||||
|         BATCH_SIZE="$OPTARG" | ||||
|         ;; | ||||
|     l ) | ||||
|         LIMIT="$OPTARG" | ||||
|         ;; | ||||
|     f ) | ||||
|         FEWSHOT="$OPTARG" | ||||
|         ;; | ||||
|     t ) | ||||
|         TP_SIZE="$OPTARG" | ||||
|         ;; | ||||
|     \? ) | ||||
|         usage | ||||
|         exit 1 | ||||
|         ;; | ||||
|   esac | ||||
| done | ||||
|  | ||||
| lm_eval --model vllm \ | ||||
|   --model_args "pretrained=$MODEL,tensor_parallel_size=$TP_SIZE,add_bos_token=true,trust_remote_code=true,max_model_len=4096" \ | ||||
|   --tasks mmlu_pro --num_fewshot "$FEWSHOT" --limit "$LIMIT" \ | ||||
|   --batch_size auto | ||||
| @ -19,21 +19,27 @@ RTOL = 0.08 | ||||
| def launch_lm_eval(eval_config, tp_size): | ||||
|     trust_remote_code = eval_config.get("trust_remote_code", False) | ||||
|     max_model_len = eval_config.get("max_model_len", 4096) | ||||
|     batch_size = eval_config.get("batch_size", "auto") | ||||
|     backend = eval_config.get("backend", "vllm") | ||||
|     model_args = ( | ||||
|         f"pretrained={eval_config['model_name']}," | ||||
|         f"tensor_parallel_size={tp_size}," | ||||
|         f"enforce_eager=true," | ||||
|         f"add_bos_token=true," | ||||
|         f"trust_remote_code={trust_remote_code}," | ||||
|         f"max_model_len={max_model_len}" | ||||
|         f"max_model_len={max_model_len}," | ||||
|     ) | ||||
|     results = lm_eval.simple_evaluate( | ||||
|         model="vllm", | ||||
|         model=backend, | ||||
|         model_args=model_args, | ||||
|         tasks=[task["name"] for task in eval_config["tasks"]], | ||||
|         num_fewshot=eval_config["num_fewshot"], | ||||
|         limit=eval_config["limit"], | ||||
|         batch_size="auto", | ||||
|         # TODO(yeq): using chat template w/ fewshot_as_multiturn is supposed help | ||||
|         # text models. however, this is regressing measured strict-match for | ||||
|         # existing text models in CI, so only apply it for mm. | ||||
|         apply_chat_template=backend == "vllm-vlm", | ||||
|         batch_size=batch_size, | ||||
|     ) | ||||
|     return results | ||||
|  | ||||
|  | ||||
| @ -368,7 +368,7 @@ if __name__ == "__main__": | ||||
|         # The GPUs sometimes come in format of "GPUTYPE\nGPUTYPE\n...", | ||||
|         # we want to turn it into "8xGPUTYPE" | ||||
|         df["GPU"] = df["GPU"].apply( | ||||
|             lambda x: f"{len(x.split('\n'))}x{x.split('\n')[0]}" | ||||
|             lambda x: f"{len(x.splitlines())}x{x.splitlines()[0]}" | ||||
|         ) | ||||
|  | ||||
|     # get markdown tables | ||||
|  | ||||
| @ -181,18 +181,14 @@ launch_vllm_server() { | ||||
|   if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then | ||||
|     echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience." | ||||
|     model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model') | ||||
|     server_command="python3 \ | ||||
|         -m vllm.entrypoints.openai.api_server \ | ||||
|     server_command="vllm serve $model \ | ||||
|         -tp $tp \ | ||||
|         --model $model \ | ||||
|         --port $port \ | ||||
|         $server_args" | ||||
|   else | ||||
|     echo "Key 'fp8' does not exist in common params." | ||||
|     server_command="python3 \ | ||||
|         -m vllm.entrypoints.openai.api_server \ | ||||
|     server_command="vllm serve $model \ | ||||
|         -tp $tp \ | ||||
|         --model $model \ | ||||
|         --port $port \ | ||||
|         $server_args" | ||||
|   fi | ||||
|  | ||||
| @ -365,8 +365,7 @@ run_serving_tests() { | ||||
|       continue | ||||
|     fi | ||||
|  | ||||
|     server_command="$server_envs python3 \ | ||||
|       -m vllm.entrypoints.openai.api_server \ | ||||
|     server_command="$server_envs vllm serve \ | ||||
|       $server_args" | ||||
|  | ||||
|     # run the server | ||||
| @ -455,11 +454,6 @@ main() { | ||||
|   fi | ||||
|   check_hf_token | ||||
|  | ||||
|   # Set to v1 to run v1 benchmark | ||||
|   if [[ "${ENGINE_VERSION:-v0}" == "v1" ]]; then | ||||
|     export VLLM_USE_V1=1 | ||||
|   fi | ||||
|  | ||||
|   # dependencies | ||||
|   (which wget && which curl) || (apt-get update && apt-get install -y wget curl) | ||||
|   (which jq) || (apt-get update && apt-get -y install jq) | ||||
|  | ||||
| @ -1,46 +0,0 @@ | ||||
| # This local pyproject file is part of the migration from yapf to ruff format. | ||||
| # It uses the same core rules as the main pyproject.toml file, but with the | ||||
| # following differences: | ||||
| # - ruff line length is overridden to 88 | ||||
| # - deprecated typing ignores (UP006, UP035) have been removed | ||||
|  | ||||
| [tool.ruff] | ||||
| line-length = 88 | ||||
|  | ||||
| [tool.ruff.lint.per-file-ignores] | ||||
| "vllm/third_party/**" = ["ALL"] | ||||
| "vllm/version.py" = ["F401"] | ||||
| "vllm/_version.py" = ["ALL"] | ||||
|  | ||||
| [tool.ruff.lint] | ||||
| select = [ | ||||
|     # pycodestyle | ||||
|     "E", | ||||
|     # Pyflakes | ||||
|     "F", | ||||
|     # pyupgrade | ||||
|     "UP", | ||||
|     # flake8-bugbear | ||||
|     "B", | ||||
|     # flake8-simplify | ||||
|     "SIM", | ||||
|     # isort | ||||
|     "I", | ||||
|     # flake8-logging-format | ||||
|     "G", | ||||
| ] | ||||
| ignore = [ | ||||
|     # star imports | ||||
|     "F405", "F403", | ||||
|     # lambda expression assignment | ||||
|     "E731", | ||||
|     # Loop control variable not used within loop body | ||||
|     "B007", | ||||
|     # f-string format | ||||
|     "UP032", | ||||
|     # Can remove once 3.10+ is the minimum Python version | ||||
|     "UP007", | ||||
| ] | ||||
|  | ||||
| [tool.ruff.format] | ||||
| docstring-code-format = true | ||||
| @ -8,7 +8,7 @@ steps: | ||||
|     commands: | ||||
|       # #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here: | ||||
|       # https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7 | ||||
|       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg VLLM_MAIN_CUDA_VERSION=12.9 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." | ||||
|       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg VLLM_MAIN_CUDA_VERSION=12.9 --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." | ||||
|       - "mkdir artifacts" | ||||
|       - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" | ||||
|       - "bash .buildkite/scripts/upload-wheels.sh" | ||||
| @ -48,7 +48,7 @@ steps: | ||||
|     agents: | ||||
|       queue: cpu_queue_postmerge | ||||
|     commands: | ||||
|       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." | ||||
|       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ." | ||||
|       - "mkdir artifacts" | ||||
|       - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'" | ||||
|       - "bash .buildkite/scripts/upload-wheels.sh" | ||||
| @ -76,7 +76,7 @@ steps: | ||||
|       queue: arm64_cpu_queue_postmerge | ||||
|     commands: | ||||
|       - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" | ||||
|       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ." | ||||
|       - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 8.9 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ." | ||||
|       - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)" | ||||
|  | ||||
|   # Add job to create multi-arch manifest | ||||
| @ -150,11 +150,16 @@ steps: | ||||
|       queue: cpu_queue_postmerge | ||||
|     commands: | ||||
|       - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7" | ||||
|       - "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT" | ||||
|       - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT vllm/vllm-openai:nightly" | ||||
|       - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT vllm/vllm-openai:nightly-$BUILDKITE_COMMIT" | ||||
|       - "docker push vllm/vllm-openai:nightly" | ||||
|       - "docker push vllm/vllm-openai:nightly-$BUILDKITE_COMMIT" | ||||
|       - "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64" | ||||
|       - "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64" | ||||
|       - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 vllm/vllm-openai:nightly-x86_64" | ||||
|       - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 vllm/vllm-openai:nightly-aarch64" | ||||
|       - "docker push vllm/vllm-openai:nightly-x86_64" | ||||
|       - "docker push vllm/vllm-openai:nightly-aarch64" | ||||
|       - "docker manifest create vllm/vllm-openai:nightly vllm/vllm-openai:nightly-x86_64 vllm/vllm-openai:nightly-aarch64 --amend" | ||||
|       - "docker manifest create vllm/vllm-openai:nightly-$BUILDKITE_COMMIT vllm/vllm-openai:nightly-x86_64 vllm/vllm-openai:nightly-aarch64 --amend" | ||||
|       - "docker manifest push vllm/vllm-openai:nightly" | ||||
|       - "docker manifest push vllm/vllm-openai:nightly-$BUILDKITE_COMMIT" | ||||
|       # Clean up old nightly builds (keep only last 14) | ||||
|       - "bash .buildkite/scripts/cleanup-nightly-builds.sh" | ||||
|     plugins: | ||||
| @ -163,3 +168,4 @@ steps: | ||||
|           password-env: DOCKERHUB_TOKEN | ||||
|     env: | ||||
|       DOCKER_BUILDKIT: "1" | ||||
|       DOCKERHUB_USERNAME: "vllmbot" | ||||
|  | ||||
| @ -8,20 +8,41 @@ set -ex | ||||
| # DockerHub API endpoint for vllm/vllm-openai repository | ||||
| REPO_API_URL="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags" | ||||
|  | ||||
| # Get DockerHub token from environment | ||||
| # Get DockerHub credentials from environment | ||||
| if [ -z "$DOCKERHUB_TOKEN" ]; then | ||||
|     echo "Error: DOCKERHUB_TOKEN environment variable is not set" | ||||
|     exit 1 | ||||
| fi | ||||
|  | ||||
| if [ -z "$DOCKERHUB_USERNAME" ]; then | ||||
|     echo "Error: DOCKERHUB_USERNAME environment variable is not set" | ||||
|     exit 1 | ||||
| fi | ||||
|  | ||||
| # Get DockerHub bearer token | ||||
| echo "Getting DockerHub bearer token..." | ||||
| set +x | ||||
| BEARER_TOKEN=$(curl -s -X POST \ | ||||
|     -H "Content-Type: application/json" \ | ||||
|     -d "{\"username\": \"$DOCKERHUB_USERNAME\", \"password\": \"$DOCKERHUB_TOKEN\"}" \ | ||||
|     "https://hub.docker.com/v2/users/login" | jq -r '.token') | ||||
| set -x | ||||
|  | ||||
| if [ -z "$BEARER_TOKEN" ] || [ "$BEARER_TOKEN" = "null" ]; then | ||||
|     echo "Error: Failed to get DockerHub bearer token" | ||||
|     exit 1 | ||||
| fi | ||||
|  | ||||
| # Function to get all tags from DockerHub | ||||
| get_all_tags() { | ||||
|     local page=1 | ||||
|     local all_tags="" | ||||
|      | ||||
|     while true; do | ||||
|         local response=$(curl -s -H "Authorization: Bearer $DOCKERHUB_TOKEN" \ | ||||
|         set +x | ||||
|         local response=$(curl -s -H "Authorization: Bearer $BEARER_TOKEN" \ | ||||
|             "$REPO_API_URL?page=$page&page_size=100") | ||||
|         set -x | ||||
|          | ||||
|         # Get both last_updated timestamp and tag name, separated by | | ||||
|         local tags=$(echo "$response" | jq -r '.results[] | select(.name | startswith("nightly-")) | "\(.last_updated)|\(.name)"') | ||||
| @ -43,7 +64,9 @@ delete_tag() { | ||||
|     echo "Deleting tag: $tag_name" | ||||
|      | ||||
|     local delete_url="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags/$tag_name" | ||||
|     local response=$(curl -s -X DELETE -H "Authorization: Bearer $DOCKERHUB_TOKEN" "$delete_url") | ||||
|     set +x | ||||
|     local response=$(curl -s -X DELETE -H "Authorization: Bearer $BEARER_TOKEN" "$delete_url") | ||||
|     set -x | ||||
|      | ||||
|     if echo "$response" | jq -e '.detail' > /dev/null 2>&1; then | ||||
|         echo "Warning: Failed to delete tag $tag_name: $(echo "$response" | jq -r '.detail')" | ||||
|  | ||||
| @ -25,25 +25,28 @@ function cpu_tests() { | ||||
|  | ||||
|   # offline inference | ||||
|   podman exec -it "$container_id" bash -c " | ||||
|     set -e | ||||
|     python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" | ||||
|     set -xve | ||||
|     python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m" >> $HOME/test_basic.log | ||||
|  | ||||
|   # Run basic model test | ||||
|   podman exec -it "$container_id" bash -c " | ||||
|     set -e | ||||
|     set -evx | ||||
|     pip install pytest pytest-asyncio einops peft Pillow soundfile transformers_stream_generator matplotlib | ||||
|     pip install sentence-transformers datamodel_code_generator | ||||
|     pytest -v -s tests/models/language/generation/test_bart.py -m cpu_model | ||||
|  | ||||
|     # Note: disable Bart until supports V1 | ||||
|     # pytest -v -s tests/models/language/generation/test_bart.py -m cpu_model | ||||
|     pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-openai-community/gpt2] | ||||
|     pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-facebook/opt-125m] | ||||
|     pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-google/gemma-1.1-2b-it] | ||||
|     pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach] | ||||
|     pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model" | ||||
|     # TODO: Below test case tests/models/language/pooling/test_embedding.py::test_models[True-ssmits/Qwen2-7B-Instruct-embed-base] fails on ppc64le. Disabling it for time being. | ||||
|     # pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model" >> $HOME/test_rest.log | ||||
| } | ||||
|  | ||||
| # All of CPU tests are expected to be finished less than 40 mins. | ||||
|  | ||||
| export container_id | ||||
| export -f cpu_tests | ||||
| timeout 40m bash -c cpu_tests | ||||
| timeout 120m bash -c cpu_tests | ||||
|  | ||||
|  | ||||
| @ -70,7 +70,7 @@ function cpu_tests() { | ||||
|   docker exec cpu-test-"$NUMA_NODE" bash -c " | ||||
|     set -e | ||||
|     pytest -x -s -v \ | ||||
|     tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs[False-10-32-neuralmagic/Llama-3.2-1B-quantized.w8a8]" | ||||
|     tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_logprobs" | ||||
|  | ||||
|   # Note: disable it until supports V1 | ||||
|   # Run AWQ test | ||||
|  | ||||
							
								
								
									
										191
									
								
								.buildkite/scripts/hardware_ci/run-npu-test.sh
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										191
									
								
								.buildkite/scripts/hardware_ci/run-npu-test.sh
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,191 @@ | ||||
| #!/bin/bash | ||||
|  | ||||
| # This script builds the Ascend NPU docker image and runs the offline inference inside the container. | ||||
| # It serves as a sanity check for compilation and basic model usage. | ||||
| set -ex | ||||
|  | ||||
| # Base ubuntu image with basic ascend development libraries and python installed | ||||
| VLLM_ASCEND_REPO="https://github.com/vllm-project/vllm-ascend.git" | ||||
| CONFIG_FILE_REMOTE_PATH="tests/e2e/vllm_interface/vllm_test.cfg" | ||||
| TEST_RUN_CONFIG_FILE="vllm_test.cfg" | ||||
| VLLM_ASCEND_TMP_DIR= | ||||
| # Get the test run configuration file from the vllm-ascend repository | ||||
| fetch_vllm_test_cfg() { | ||||
|     VLLM_ASCEND_TMP_DIR=$(mktemp -d) | ||||
|     # Ensure that the temporary directory is cleaned up when an exception occurs during configuration file retrieval | ||||
|     cleanup() { | ||||
|         rm -rf "${VLLM_ASCEND_TMP_DIR}" | ||||
|     } | ||||
|     trap cleanup EXIT | ||||
|  | ||||
|     GIT_TRACE=1 git clone -v --depth 1 "${VLLM_ASCEND_REPO}" "${VLLM_ASCEND_TMP_DIR}" | ||||
|     if [ ! -f "${VLLM_ASCEND_TMP_DIR}/${CONFIG_FILE_REMOTE_PATH}" ]; then | ||||
|         echo "Error: file '${CONFIG_FILE_REMOTE_PATH}' does not exist in the warehouse" >&2 | ||||
|         exit 1 | ||||
|     fi | ||||
|  | ||||
|     # If the file already exists locally, just overwrite it | ||||
|     cp "${VLLM_ASCEND_TMP_DIR}/${CONFIG_FILE_REMOTE_PATH}" "${TEST_RUN_CONFIG_FILE}" | ||||
|     echo "Copied ${CONFIG_FILE_REMOTE_PATH} to ${TEST_RUN_CONFIG_FILE}" | ||||
|  | ||||
|     # The EXIT trap is replaced later in the script, so it cannot be relied on to clean up on a normal exit. | ||||
|     # The error path it covered has been passed by this point, so delete the temporary directory manually and clear the trap. | ||||
|     rm -rf "${VLLM_ASCEND_TMP_DIR}" | ||||
|     trap - EXIT | ||||
| } | ||||
|  | ||||
| # Checks that the test run configuration file exists locally (it is fetched by fetch_vllm_test_cfg). | ||||
| # Loads the configuration into the current script environment. | ||||
| get_config() { | ||||
|     if [ ! -f "${TEST_RUN_CONFIG_FILE}" ]; then | ||||
|         echo "Error: file '${TEST_RUN_CONFIG_FILE}' does not exist in the warehouse" >&2 | ||||
|         exit 1 | ||||
|     fi | ||||
|     source "${TEST_RUN_CONFIG_FILE}" | ||||
|     echo "Base docker image name that get from configuration: ${BASE_IMAGE_NAME}" | ||||
|     return 0 | ||||
| } | ||||
|  | ||||
| # get test running configuration. | ||||
| fetch_vllm_test_cfg | ||||
| get_config | ||||
| # Check if the function call was successful. If not, exit the script. | ||||
| if [ $? -ne 0 ]; then | ||||
|   exit 1 | ||||
| fi | ||||
|  | ||||
| image_name="npu/vllm-ci:${BUILDKITE_COMMIT}_${EPOCHSECONDS}" | ||||
| container_name="npu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)" | ||||
|  | ||||
| # BUILDKITE_AGENT_NAME format is {hostname}-{agent_idx}-{npu_card_num}cards | ||||
| agent_idx=$(echo "${BUILDKITE_AGENT_NAME}" | awk -F'-' '{print $(NF-1)}') | ||||
| echo "agent_idx: ${agent_idx}" | ||||
| builder_name="cachebuilder${agent_idx}" | ||||
| builder_cache_dir="/mnt/docker-cache${agent_idx}" | ||||
| mkdir -p ${builder_cache_dir} | ||||
|  | ||||
| # Try building the docker image | ||||
| cat <<EOF | DOCKER_BUILDKIT=1 docker build \ | ||||
|     --add-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local:${PYPI_CACHE_HOST} \ | ||||
|     --builder ${builder_name} --cache-from type=local,src=${builder_cache_dir} \ | ||||
|                            --cache-to type=local,dest=${builder_cache_dir},mode=max \ | ||||
|     --progress=plain --load -t ${image_name} -f - . | ||||
| FROM ${BASE_IMAGE_NAME} | ||||
|  | ||||
| # Define environments | ||||
| ENV DEBIAN_FRONTEND=noninteractive | ||||
|  | ||||
| RUN pip config set global.index-url http://cache-service-vllm.nginx-pypi-cache.svc.cluster.local:${PYPI_CACHE_PORT}/pypi/simple && \ | ||||
|     pip config set global.trusted-host cache-service-vllm.nginx-pypi-cache.svc.cluster.local && \ | ||||
|     apt-get update -y && \ | ||||
|     apt-get install -y python3-pip git vim wget net-tools gcc g++ cmake libnuma-dev && \ | ||||
|     rm -rf /var/cache/apt/* && \ | ||||
|     rm -rf /var/lib/apt/lists/* | ||||
|  | ||||
| # Install pytest and modelscope in their own layer so the docker build cache layer stays valid. | ||||
| # The requirement specifier must be quoted: unquoted, the RUN shell parses ">=6.0" as an output | ||||
| # redirect to a file named "=6.0", which drops the version constraint. | ||||
| RUN --mount=type=cache,target=/root/.cache/pip \ | ||||
|     pip install "pytest>=6.0" modelscope | ||||
|  | ||||
| WORKDIR /workspace/vllm | ||||
|  | ||||
| # Install vLLM dependencies in advance. Effect: As long as common.txt remains unchanged, the docker cache layer will be valid. | ||||
| COPY requirements/common.txt /workspace/vllm/requirements/common.txt | ||||
| RUN --mount=type=cache,target=/root/.cache/pip \ | ||||
|     pip install -r requirements/common.txt | ||||
|  | ||||
| COPY . . | ||||
|  | ||||
| # Install vLLM | ||||
| RUN --mount=type=cache,target=/root/.cache/pip \ | ||||
|     VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \ | ||||
|     python3 -m pip uninstall -y triton | ||||
|  | ||||
| # Install vllm-ascend | ||||
| WORKDIR /workspace | ||||
| ARG VLLM_ASCEND_REPO=https://github.com/vllm-project/vllm-ascend.git | ||||
| ARG VLLM_ASCEND_TAG=main | ||||
| RUN git config --global url."https://gh-proxy.test.osinfra.cn/https://github.com/".insteadOf "https://github.com/" && \ | ||||
|     git clone --depth 1 \$VLLM_ASCEND_REPO --branch \$VLLM_ASCEND_TAG /workspace/vllm-ascend | ||||
|  | ||||
| # Install vllm-ascend dependencies (its requirements.txt) in advance, in a separate docker layer from the vllm-ascend install below. | ||||
| RUN --mount=type=cache,target=/root/.cache/pip \ | ||||
|     pip install -r /workspace/vllm-ascend/requirements.txt | ||||
|  | ||||
| RUN --mount=type=cache,target=/root/.cache/pip \ | ||||
|     export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \ | ||||
|     source /usr/local/Ascend/ascend-toolkit/set_env.sh && \ | ||||
|     source /usr/local/Ascend/nnal/atb/set_env.sh && \ | ||||
|     export LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \ | ||||
|     python3 -m pip install -v -e /workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ | ||||
|  | ||||
| ENV VLLM_WORKER_MULTIPROC_METHOD=spawn | ||||
| ENV VLLM_USE_MODELSCOPE=True | ||||
|  | ||||
| WORKDIR /workspace/vllm-ascend | ||||
|  | ||||
| CMD ["/bin/bash"] | ||||
|  | ||||
| EOF | ||||
|  | ||||
| # Setup cleanup | ||||
| remove_docker_container() { | ||||
|   docker rm -f "${container_name}" || true; | ||||
|   docker image rm -f "${image_name}" || true; | ||||
|   docker system prune -f || true; | ||||
| } | ||||
| trap remove_docker_container EXIT | ||||
|  | ||||
| # Generate corresponding --device args based on BUILDKITE_AGENT_NAME | ||||
| # Ascend NPU BUILDKITE_AGENT_NAME format is {hostname}-{agent_idx}-{npu_card_num}cards, and agent_idx starts from 1. | ||||
| #   e.g. atlas-a2-001-1-2cards means this is the 1st agent on the atlas-a2-001 host, and it has 2 NPU cards. | ||||
| #   returns --device /dev/davinci0 --device /dev/davinci1 | ||||
| parse_and_gen_devices() { | ||||
|     local input="$1" | ||||
|     local index cards_num | ||||
|     if [[ "$input" =~ ([0-9]+)-([0-9]+)cards$ ]]; then | ||||
|         index="${BASH_REMATCH[1]}" | ||||
|         cards_num="${BASH_REMATCH[2]}" | ||||
|     else | ||||
|         echo "parse error" >&2 | ||||
|         return 1 | ||||
|     fi | ||||
|  | ||||
|     local devices="" | ||||
|     local i=0 | ||||
|     while (( i < cards_num )); do | ||||
|         local dev_idx=$(((index - 1)*cards_num + i )) | ||||
|         devices="$devices --device /dev/davinci${dev_idx}" | ||||
|         ((i++)) | ||||
|     done | ||||
|  | ||||
|     # trim leading space | ||||
|     devices="${devices#"${devices%%[![:space:]]*}"}" | ||||
|     # Output devices: assigned to the caller variable | ||||
|     printf '%s' "$devices" | ||||
| } | ||||
|  | ||||
| devices=$(parse_and_gen_devices "${BUILDKITE_AGENT_NAME}") || exit 1 | ||||
|  | ||||
| # Run the image and execute the Out-Of-Tree (OOT) platform interface test case on Ascend NPU hardware. | ||||
| # This test checks whether the OOT platform interface is functioning properly in conjunction with | ||||
| # the hardware plugin vllm-ascend. | ||||
| model_cache_dir=/mnt/modelscope${agent_idx} | ||||
| mkdir -p ${model_cache_dir} | ||||
| docker run \ | ||||
|     ${devices} \ | ||||
|     --device /dev/davinci_manager \ | ||||
|     --device /dev/devmm_svm \ | ||||
|     --device /dev/hisi_hdc \ | ||||
|     -v /usr/local/dcmi:/usr/local/dcmi \ | ||||
|     -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \ | ||||
|     -v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \ | ||||
|     -v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \ | ||||
|     -v /etc/ascend_install.info:/etc/ascend_install.info \ | ||||
|     -v ${model_cache_dir}:/root/.cache/modelscope \ | ||||
|     --entrypoint="" \ | ||||
|     --name "${container_name}" \ | ||||
|     "${image_name}" \ | ||||
|     bash -c ' | ||||
|     set -e | ||||
|     pytest -v -s tests/e2e/vllm_interface/ | ||||
| ' | ||||
| @ -64,10 +64,9 @@ python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git | ||||
|     && python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \ | ||||
|     && python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0 | ||||
| echo "--- Python dependencies installed ---" | ||||
| export VLLM_USE_V1=1 | ||||
|  | ||||
| export VLLM_XLA_CHECK_RECOMPILATION=1 | ||||
| export VLLM_XLA_CACHE_PATH= | ||||
| echo "Using VLLM V1" | ||||
|  | ||||
| echo "--- Hardware Information ---" | ||||
| # tpu-info | ||||
|  | ||||
| @ -64,10 +64,9 @@ python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git | ||||
|     && python3 -m pip install --progress-bar off "lm-eval @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d" \ | ||||
|     && python3 -m pip install --progress-bar off hf-transfer tblib==3.1.0 | ||||
| echo "--- Python dependencies installed ---" | ||||
| export VLLM_USE_V1=1 | ||||
|  | ||||
| export VLLM_XLA_CHECK_RECOMPILATION=1 | ||||
| export VLLM_XLA_CACHE_PATH= | ||||
| echo "Using VLLM V1" | ||||
|  | ||||
| echo "--- Hardware Information ---" | ||||
| # tpu-info | ||||
|  | ||||
| @ -42,9 +42,7 @@ docker run \ | ||||
|     pytest -v -s v1/sample --ignore=v1/sample/test_logprobs.py --ignore=v1/sample/test_logprobs_e2e.py | ||||
|     pytest -v -s v1/worker --ignore=v1/worker/test_gpu_model_runner.py | ||||
|     pytest -v -s v1/structured_output | ||||
|     pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_eagle.py --ignore=v1/spec_decode/test_tree_attention.py | ||||
|     pytest -v -s v1/spec_decode --ignore=v1/spec_decode/test_max_len.py --ignore=v1/spec_decode/test_tree_attention.py | ||||
|     pytest -v -s v1/kv_connector/unit --ignore=v1/kv_connector/unit/test_multi_connector.py --ignore=v1/kv_connector/unit/test_nixl_connector.py --ignore=v1/kv_connector/unit/test_shared_storage_connector.py | ||||
|     pytest -v -s v1/test_serial_utils.py | ||||
|     pytest -v -s v1/test_utils.py | ||||
|     pytest -v -s v1/test_metrics_reader.py | ||||
| ' | ||||
|  | ||||
| @ -18,7 +18,7 @@ vllm bench throughput --input-len 256 --output-len 256 --output-json throughput_ | ||||
| bench_throughput_exit_code=$? | ||||
|  | ||||
| # run server-based benchmarks and upload the result to buildkite | ||||
| python3 -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-2-7b-chat-hf & | ||||
| vllm serve meta-llama/Llama-2-7b-chat-hf & | ||||
| server_pid=$! | ||||
| wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json | ||||
|  | ||||
|  | ||||
| @ -9,6 +9,6 @@ MAX_NUM_BATCHED_TOKENS=1024 | ||||
| TENSOR_PARALLEL_SIZE=1 | ||||
| MAX_MODEL_LEN=2048 | ||||
| DOWNLOAD_DIR=/mnt/disks/persist | ||||
| EXPECTED_THROUGHPUT=10.0 | ||||
| EXPECTED_THROUGHPUT=8.7 | ||||
| INPUT_LEN=1800 | ||||
| OUTPUT_LEN=128 | ||||
|  | ||||
| @ -42,7 +42,7 @@ echo "lanching vllm..." | ||||
| echo "logging to $VLLM_LOG" | ||||
| echo | ||||
|  | ||||
| VLLM_USE_V1=1 vllm serve $MODEL \ | ||||
| vllm serve $MODEL \ | ||||
|  --seed 42 \ | ||||
|  --max-num-seqs $MAX_NUM_SEQS \ | ||||
|  --max-num-batched-tokens $MAX_NUM_BATCHED_TOKENS \ | ||||
|  | ||||
							
								
								
									
										1267
									
								
								.buildkite/test-amd.yaml
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										1267
									
								
								.buildkite/test-amd.yaml
									
									
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							| @ -50,19 +50,28 @@ steps: | ||||
|   mirror_hardwares: [amdexperimental] | ||||
|   source_file_dependencies: | ||||
|   - vllm/ | ||||
|   - tests/multimodal | ||||
|   - tests/utils_ | ||||
|   commands: | ||||
|   - pytest -v -s -m 'not cpu_test' multimodal | ||||
|   - pytest -v -s utils_ | ||||
|  | ||||
| - label: Async Engine, Inputs, Utils, Worker Test (CPU) # 4 mins | ||||
|   timeout_in_minutes: 10 | ||||
|   source_file_dependencies: | ||||
|   - vllm/ | ||||
|   - tests/test_inputs.py | ||||
|   - tests/test_outputs.py | ||||
|   - tests/multimodal | ||||
|   - tests/utils_ | ||||
|   - tests/standalone_tests/lazy_imports.py | ||||
|   - tests/transformers_utils | ||||
|   no_gpu: true | ||||
|   commands: | ||||
|   - python3 standalone_tests/lazy_imports.py | ||||
|   - pytest -v -s test_inputs.py | ||||
|   - pytest -v -s test_outputs.py | ||||
|   - pytest -v -s multimodal | ||||
|   - pytest -v -s utils_ # Utils | ||||
|   - pytest -v -s transformers_utils # transformers_utils | ||||
|   - pytest -v -s -m 'cpu_test' multimodal | ||||
|   - pytest -v -s transformers_utils | ||||
|  | ||||
| - label: Python-only Installation Test # 10min | ||||
|   timeout_in_minutes: 20 | ||||
| @ -159,10 +168,7 @@ steps: | ||||
|   - examples/offline_inference/rlhf.py | ||||
|   - examples/offline_inference/rlhf_colocate.py | ||||
|   - tests/examples/offline_inference/data_parallel.py | ||||
|   - tests/v1/test_async_llm_dp.py | ||||
|   - tests/v1/test_external_lb_dp.py | ||||
|   - tests/v1/test_internal_lb_dp.py | ||||
|   - tests/v1/test_hybrid_lb_dp.py | ||||
|   - tests/v1/distributed | ||||
|   - tests/v1/engine/test_engine_core_client.py | ||||
|   - tests/distributed/test_symm_mem_allreduce.py | ||||
|   commands: | ||||
| @ -180,10 +186,10 @@ steps: | ||||
|   - TP_SIZE=2 DP_SIZE=2 ENABLE_EP=1 torchrun --nproc-per-node=4 distributed/test_torchrun_example_moe.py | ||||
|   # test with internal dp | ||||
|   - python3 ../examples/offline_inference/data_parallel.py --enforce-eager | ||||
|   - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py | ||||
|   - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_external_lb_dp.py | ||||
|   - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/test_internal_lb_dp.py | ||||
|   - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/test_hybrid_lb_dp.py | ||||
|   - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py | ||||
|   - TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py | ||||
|   - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py | ||||
|   - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_hybrid_lb_dp.py | ||||
|   - pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp | ||||
|   - pytest -v -s distributed/test_utils.py | ||||
|   - pytest -v -s compile/test_basic_correctness.py | ||||
| @ -290,26 +296,35 @@ steps: | ||||
|     - tests/v1 | ||||
|   commands: | ||||
|     # split the test to avoid interference | ||||
|     - pytest -v -s v1/core | ||||
|     - pytest -v -s -m 'not cpu_test' v1/core | ||||
|     - pytest -v -s v1/executor | ||||
|     - pytest -v -s v1/kv_offload | ||||
|     - pytest -v -s v1/sample | ||||
|     - pytest -v -s v1/logits_processors | ||||
|     - pytest -v -s v1/worker | ||||
|     - pytest -v -s v1/structured_output | ||||
|     - pytest -v -s v1/spec_decode | ||||
|     - pytest -v -s v1/kv_connector/unit | ||||
|     - pytest -v -s v1/metrics | ||||
|     - pytest -v -s v1/test_kv_sharing.py | ||||
|     - pytest -v -s v1/test_metrics_reader.py | ||||
|     - pytest -v -s -m 'not cpu_test' v1/kv_connector/unit | ||||
|     - pytest -v -s -m 'not cpu_test' v1/metrics | ||||
|     - pytest -v -s v1/test_oracle.py | ||||
|     - pytest -v -s v1/test_request.py | ||||
|     - pytest -v -s v1/test_serial_utils.py | ||||
|     - pytest -v -s v1/test_utils.py | ||||
|     # Integration test for streaming correctness (requires special branch). | ||||
|     - pip install -U git+https://github.com/robertgshaw2-redhat/lm-evaluation-harness.git@streaming-api | ||||
|     - pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine | ||||
|  | ||||
| - label: V1 Test others (CPU) # 5 mins | ||||
|   source_file_dependencies: | ||||
|     - vllm/ | ||||
|     - tests/v1 | ||||
|   no_gpu: true | ||||
|   commands: | ||||
|     # split the test to avoid interference | ||||
|     - pytest -v -s -m 'cpu_test' v1/core | ||||
|     - pytest -v -s v1/structured_output | ||||
|     - pytest -v -s v1/test_serial_utils.py | ||||
|     - pytest -v -s -m 'cpu_test' v1/kv_connector/unit | ||||
|     - pytest -v -s -m 'cpu_test' v1/metrics | ||||
|  | ||||
|  | ||||
| - label: Examples Test # 30min | ||||
|   timeout_in_minutes: 45 | ||||
|   mirror_hardwares: [amdexperimental] | ||||
| @ -383,12 +398,12 @@ steps: | ||||
|     - pytest -v -s compile/test_pass_manager.py | ||||
|     - pytest -v -s compile/test_fusion.py | ||||
|     - pytest -v -s compile/test_fusion_attn.py | ||||
|     - pytest -v -s compile/test_functionalization.py | ||||
|     - pytest -v -s compile/test_silu_mul_quant_fusion.py | ||||
|     - pytest -v -s compile/test_sequence_parallelism.py | ||||
|     - pytest -v -s compile/test_async_tp.py | ||||
|     - pytest -v -s compile/test_fusion_all_reduce.py | ||||
|     - pytest -v -s compile/test_decorator.py | ||||
|     - pytest -v -s compile/test_noop_elimination.py | ||||
|     - pytest -v -s compile/test_aot_compile.py | ||||
|  | ||||
| - label: PyTorch Fullgraph Smoke Test # 15min | ||||
|   timeout_in_minutes: 30 | ||||
| @ -401,8 +416,8 @@ steps: | ||||
|   - pytest -v -s compile/test_basic_correctness.py | ||||
|   - pytest -v -s compile/piecewise/ | ||||
|  | ||||
| - label: PyTorch Fullgraph Test # 20min | ||||
|   timeout_in_minutes: 30 | ||||
| - label: PyTorch Fullgraph Test # 22min | ||||
|   timeout_in_minutes: 35 | ||||
|   mirror_hardwares: [amdexperimental] | ||||
|   torch_nightly: true | ||||
|   source_file_dependencies: | ||||
| @ -410,6 +425,7 @@ steps: | ||||
|   - tests/compile | ||||
|   commands: | ||||
|   - pytest -v -s compile/test_full_graph.py | ||||
|   - pytest -v -s compile/test_fusions_e2e.py | ||||
|  | ||||
| - label: Kernels Core Operation Test # 48min | ||||
|   timeout_in_minutes: 75 | ||||
| @ -417,8 +433,9 @@ steps: | ||||
|   source_file_dependencies: | ||||
|   - csrc/ | ||||
|   - tests/kernels/core | ||||
|   - tests/kernels/test_top_k_per_row.py | ||||
|   commands: | ||||
|     - pytest -v -s kernels/core | ||||
|     - pytest -v -s kernels/core kernels/test_top_k_per_row.py | ||||
|  | ||||
| - label: Kernels Attention Test %N # 23min | ||||
|   timeout_in_minutes: 35 | ||||
| @ -462,32 +479,22 @@ steps: | ||||
|   source_file_dependencies: | ||||
|   - csrc/mamba/ | ||||
|   - tests/kernels/mamba | ||||
|   - vllm/model_executor/layers/mamba/ops | ||||
|   commands: | ||||
|     - pytest -v -s kernels/mamba | ||||
|  | ||||
| - label: Tensorizer Test # 14min | ||||
|   timeout_in_minutes: 25 | ||||
|   mirror_hardwares: [amdexperimental] | ||||
|   source_file_dependencies: | ||||
|   - vllm/model_executor/model_loader | ||||
|   - tests/tensorizer_loader | ||||
|   - tests/entrypoints/openai/test_tensorizer_entrypoint.py | ||||
|   commands: | ||||
|     - apt-get update && apt-get install -y curl libsodium23 | ||||
|     - export VLLM_WORKER_MULTIPROC_METHOD=spawn | ||||
|     - pytest -v -s tensorizer_loader | ||||
|     - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py | ||||
|  | ||||
| - label: Model Executor Test # 7min | ||||
|   timeout_in_minutes: 20 | ||||
| - label: Model Executor Test # 23min | ||||
|   timeout_in_minutes: 35 | ||||
|   mirror_hardwares: [amdexperimental] | ||||
|   source_file_dependencies: | ||||
|   - vllm/model_executor | ||||
|   - tests/model_executor | ||||
|   - tests/entrypoints/openai/test_tensorizer_entrypoint.py | ||||
|   commands: | ||||
|     - apt-get update && apt-get install -y curl libsodium23 | ||||
|     - export VLLM_WORKER_MULTIPROC_METHOD=spawn | ||||
|     - pytest -v -s model_executor | ||||
|     - pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py | ||||
|  | ||||
| - label: Benchmarks # 11min | ||||
|   timeout_in_minutes: 20 | ||||
| @ -521,8 +528,9 @@ steps: | ||||
|   # since torchao nightly is only compatible with torch nightly currently | ||||
|   # https://github.com/pytorch/ao/issues/2919, we'll have to skip new torchao tests for now | ||||
|   # we can only upgrade after this is resolved | ||||
|   - pip install --pre torchao==0.13.0.dev20250814 --index-url https://download.pytorch.org/whl/nightly/cu128 | ||||
|   - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization | ||||
|   # TODO(jerryzh168): resolve the above comment | ||||
|   - uv pip install --system torchao==0.13.0 | ||||
|   - VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization/ --ignore quantization/test_blackwell_moe.py | ||||
|  | ||||
| - label: LM Eval Small Models # 53min | ||||
|   timeout_in_minutes: 75 | ||||
| @ -550,10 +558,17 @@ steps: | ||||
|   source_file_dependencies: | ||||
|     - vllm/ | ||||
|     - tests/tool_use | ||||
|     - tests/mistral_tool_use | ||||
|   commands: | ||||
|     - pytest -v -s tool_use | ||||
|     - pytest -v -s mistral_tool_use | ||||
|     - pytest -v -s -m 'not cpu_test' tool_use | ||||
|  | ||||
| - label: OpenAI-Compatible Tool Use (CPU) # 5 mins | ||||
|   timeout_in_minutes: 10 | ||||
|   source_file_dependencies: | ||||
|     - vllm/ | ||||
|     - tests/tool_use | ||||
|   no_gpu: true | ||||
|   commands: | ||||
|     - pytest -v -s -m 'cpu_test' tool_use | ||||
|  | ||||
| #####  models test  ##### | ||||
|  | ||||
| @ -593,13 +608,19 @@ steps: | ||||
|   - vllm/ | ||||
|   - tests/models/test_transformers.py | ||||
|   - tests/models/test_registry.py | ||||
|   commands: | ||||
|     - pytest -v -s models/test_transformers.py models/test_registry.py | ||||
|  | ||||
| - label: Basic Models Test (Other CPU) # 5min | ||||
|   timeout_in_minutes: 10 | ||||
|   torch_nightly: true | ||||
|   source_file_dependencies: | ||||
|   - vllm/ | ||||
|   - tests/models/test_utils.py | ||||
|   - tests/models/test_vision.py | ||||
|   no_gpu: true | ||||
|   commands: | ||||
|     - pytest -v -s models/test_transformers.py \ | ||||
|                    models/test_registry.py \ | ||||
|                    models/test_utils.py \ | ||||
|                    models/test_vision.py | ||||
|     - pytest -v -s models/test_utils.py models/test_vision.py | ||||
|  | ||||
| - label: Language Models Tests (Standard) | ||||
|   timeout_in_minutes: 25 | ||||
| @ -714,6 +735,16 @@ steps: | ||||
|     - pytest -v -s models/multimodal -m core_model --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/processing | ||||
|     - cd .. && VLLM_WORKER_MULTIPROC_METHOD=spawn pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model  # Otherwise, mp_method="spawn" doesn't work | ||||
|  | ||||
| - label: Multi-Modal Accuracy Eval (Small Models) # 50min | ||||
|   timeout_in_minutes: 70 | ||||
|   working_dir: "/vllm-workspace/.buildkite/lm-eval-harness" | ||||
|   source_file_dependencies: | ||||
|   - vllm/multimodal/ | ||||
|   - vllm/inputs/ | ||||
|   - vllm/v1/core/ | ||||
|   commands: | ||||
|   - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-mm-small.txt --tp-size=1 | ||||
|  | ||||
| - label: Multi-Modal Models Test (Extended) 1 | ||||
|   mirror_hardwares: [amdexperimental] | ||||
|   optional: true | ||||
| @ -769,6 +800,7 @@ steps: | ||||
|   commands: | ||||
|     - pip install --upgrade git+https://github.com/huggingface/transformers | ||||
|     - pytest -v -s tests/models/test_initialization.py | ||||
|     - pytest -v -s tests/models/test_transformers.py | ||||
|     - pytest -v -s tests/models/multimodal/processing/ | ||||
|     - pytest -v -s tests/models/multimodal/test_mapping.py | ||||
|     - python3 examples/offline_inference/basic/chat.py | ||||
| @ -776,8 +808,8 @@ steps: | ||||
|     # Whisper needs spawn method to avoid deadlock | ||||
|     - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper | ||||
|  | ||||
| - label: Blackwell Test # 38 min | ||||
|   timeout_in_minutes: 60 | ||||
| - label: Blackwell Test # 21 min | ||||
|   timeout_in_minutes: 30 | ||||
|   working_dir: "/vllm-workspace/" | ||||
|   gpu: b200 | ||||
|   # optional: true | ||||
| @ -790,8 +822,6 @@ steps: | ||||
|   - vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py | ||||
|   - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py | ||||
|   - vllm/v1/attention/backends/flashinfer.py | ||||
|   - vllm/compilation/fusion.py | ||||
|   - vllm/compilation/fusion_attn.py | ||||
|   commands: | ||||
|     - nvidia-smi | ||||
|     - python3 examples/offline_inference/basic/chat.py | ||||
| @ -808,19 +838,38 @@ steps: | ||||
|     - pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py | ||||
|     - pytest -v -s tests/kernels/quantization/test_flashinfer_scaled_mm.py | ||||
|     - pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py | ||||
|     - pytest -v -s tests/kernels/quantization/test_nvfp4_qutlass.py | ||||
|     - pytest -v -s tests/kernels/quantization/test_mxfp4_qutlass.py | ||||
|     - pytest -v -s tests/kernels/moe/test_nvfp4_moe.py | ||||
|     - pytest -v -s tests/kernels/moe/test_mxfp4_moe.py | ||||
|     # Fusion | ||||
|     - pytest -v -s tests/compile/test_fusion_all_reduce.py | ||||
|     - pytest -v -s tests/compile/test_fusion_attn.py::test_attention_quant_pattern | ||||
|     - pytest -v -s tests/kernels/moe/test_ocp_mx_moe.py | ||||
|     - pytest -v -s tests/kernels/moe/test_flashinfer.py | ||||
|     - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py | ||||
|  | ||||
| - label: GPT-OSS Eval (Blackwell) | ||||
| - label: Blackwell Fusion Tests # 30 min | ||||
|   timeout_in_minutes: 40 | ||||
|   working_dir: "/vllm-workspace/" | ||||
|   gpu: b200 | ||||
|   source_file_dependencies: | ||||
|   - csrc/quantization/fp4/ | ||||
|   - vllm/model_executor/layers/quantization/utils/flashinfer_utils.py | ||||
|   - vllm/v1/attention/backends/flashinfer.py | ||||
|   - vllm/compilation/ | ||||
|   # can affect pattern matching | ||||
|   - vllm/model_executor/layers/layernorm.py | ||||
|   - vllm/model_executor/layers/activation.py | ||||
|   - vllm/model_executor/layers/quantization/input_quant_fp8.py | ||||
|   commands: | ||||
|     - nvidia-smi | ||||
|     - pytest -v -s tests/compile/test_fusion_attn.py | ||||
|     - pytest -v -s tests/compile/test_silu_mul_quant_fusion.py | ||||
|     # this runner has 2 GPUs available even though num_gpus=2 is not set | ||||
|     - pytest -v -s tests/compile/test_fusion_all_reduce.py | ||||
|     - pytest -v -s tests/compile/test_fusions_e2e.py | ||||
|  | ||||
| - label: Blackwell GPT-OSS Eval | ||||
|   timeout_in_minutes: 60 | ||||
|   working_dir: "/vllm-workspace/" | ||||
|   gpu: b200 | ||||
|   optional: true # disable while debugging | ||||
|   optional: true # run on nightlies | ||||
|   source_file_dependencies: | ||||
|   - tests/evals/gpt_oss | ||||
|   - vllm/model_executor/models/gpt_oss.py | ||||
| @ -828,7 +877,34 @@ steps: | ||||
|   - vllm/v1/attention/backends/flashinfer.py | ||||
|   commands: | ||||
|     - uv pip install --system 'gpt-oss[eval]==0.0.5' | ||||
|     - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58 --server-args '--tensor-parallel-size 2' | ||||
|     - pytest -s -v tests/evals/gpt_oss/test_gpqa_correctness.py --model openai/gpt-oss-20b --metric 0.58 | ||||
|  | ||||
| - label: Blackwell Quantized MoE Test | ||||
|   timeout_in_minutes: 60 | ||||
|   working_dir: "/vllm-workspace/" | ||||
|   gpu: b200 | ||||
|   source_file_dependencies: | ||||
|   - tests/quantization/test_blackwell_moe.py | ||||
|   - vllm/model_executor/models/deepseek_v2.py | ||||
|   - vllm/model_executor/models/gpt_oss.py | ||||
|   - vllm/model_executor/models/llama4.py | ||||
|   - vllm/model_executor/layers/fused_moe | ||||
|   - vllm/model_executor/layers/quantization/compressed_tensors | ||||
|   - vllm/model_executor/layers/quantization/modelopt.py | ||||
|   - vllm/model_executor/layers/quantization/mxfp4.py | ||||
|   - vllm/v1/attention/backends/flashinfer.py | ||||
|   commands: | ||||
|     - pytest -s -v tests/quantization/test_blackwell_moe.py | ||||
|  | ||||
| - label: Blackwell LM Eval Small Models | ||||
|   timeout_in_minutes: 120 | ||||
|   gpu: b200 | ||||
|   optional: true # run on nightlies | ||||
|   source_file_dependencies: | ||||
|   - csrc/ | ||||
|   - vllm/model_executor/layers/quantization | ||||
|   commands: | ||||
|   - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-blackwell.txt --tp-size=1 | ||||
|  | ||||
| #####  1 GPU test  ##### | ||||
| #####  multi gpus test  ##### | ||||
| @ -889,14 +965,13 @@ steps: | ||||
|   - tests/compile/test_wrapper.py | ||||
|   - tests/distributed/ | ||||
|   - tests/entrypoints/llm/test_collective_rpc.py | ||||
|   - tests/v1/test_async_llm_dp.py | ||||
|   - tests/v1/test_external_lb_dp.py | ||||
|   - tests/v1/distributed | ||||
|   - tests/v1/entrypoints/openai/test_multi_api_servers.py | ||||
|   - tests/v1/shutdown | ||||
|   - tests/v1/worker/test_worker_memory_snapshot.py | ||||
|   commands: | ||||
|   - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py | ||||
|   - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_external_lb_dp.py | ||||
|   - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_async_llm_dp.py | ||||
|   - TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/distributed/test_external_lb_dp.py | ||||
|   - DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py | ||||
|   - pytest -v -s entrypoints/llm/test_collective_rpc.py | ||||
|   - pytest -v -s ./compile/test_basic_correctness.py | ||||
| @ -945,6 +1020,11 @@ steps: | ||||
|   - pytest -v -s plugins_tests/test_io_processor_plugins.py | ||||
|   - pip uninstall prithvi_io_processor_plugin -y | ||||
|   # end io_processor plugins test | ||||
|   # begin stat_logger plugins test | ||||
|   - pip install -e ./plugins/vllm_add_dummy_stat_logger | ||||
|   - pytest -v -s plugins_tests/test_stats_logger_plugins.py | ||||
|   - pip uninstall dummy_stat_logger -y | ||||
|   # end stat_logger plugins test | ||||
|   # other tests continue here: | ||||
|   - pytest -v -s plugins_tests/test_scheduler_plugins.py | ||||
|   - pip install -e ./plugins/vllm_add_dummy_model | ||||
| @ -1009,6 +1089,17 @@ steps: | ||||
|   - tests/weight_loading | ||||
|   commands: | ||||
|     - bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt | ||||
|    | ||||
| - label: NixlConnector PD accuracy tests (Distributed) # 30min | ||||
|   timeout_in_minutes: 30 | ||||
|   working_dir: "/vllm-workspace/tests" | ||||
|   num_gpus: 4 | ||||
|   source_file_dependencies: | ||||
|     - vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py | ||||
|     - tests/v1/kv_connector/nixl_integration/ | ||||
|   commands: | ||||
|     - uv pip install --system -r /vllm-workspace/requirements/kv_connectors.txt | ||||
|     - bash v1/kv_connector/nixl_integration/tp_config_sweep_accuracy_test.sh | ||||
|  | ||||
|  | ||||
| ##### multi gpus test ##### | ||||
| @ -1041,12 +1132,16 @@ steps: | ||||
|   - pytest -s -v test_lm_eval_correctness.py --config-list-file=configs/models-large.txt --tp-size=4 | ||||
|  | ||||
| ##### H200 test ##### | ||||
| - label: Distrubted Tests (H200) # optional | ||||
| - label: Distributed Tests (H200) # optional | ||||
|   gpu: h200 | ||||
|   optional: true | ||||
|   working_dir: "/vllm-workspace/" | ||||
|   num_gpus: 2 | ||||
|   commands: | ||||
|     - pytest -v -s tests/compile/test_async_tp.py | ||||
|     - pytest -v -s tests/compile/test_sequence_parallelism.py | ||||
|     - pytest -v -s tests/compile/test_fusion_all_reduce.py | ||||
|     - pytest -v -s tests/compile/test_fusions_e2e.py::test_tp2_attn_quant_allreduce_rmsnorm | ||||
|     - pytest -v -s tests/distributed/test_context_parallel.py | ||||
|     - CUDA_VISIBLE_DEVICES=1,2 VLLM_ALL2ALL_BACKEND=deepep_high_throughput VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model Qwen/Qwen1.5-MoE-A2.7B --tp-size=1  --dp-size=2 --max-model-len 2048 | ||||
|  | ||||
|  | ||||
							
								
								
									
										17
									
								
								.coveragerc
									
									
									
									
									
								
							
							
						
						
									
										17
									
								
								.coveragerc
									
									
									
									
									
								
							| @ -1,5 +1,10 @@ | ||||
| [run] | ||||
| source = vllm | ||||
| # Track the installed vllm package (this is what actually gets imported during tests) | ||||
| # Use wildcard pattern to match the installed location | ||||
| source = | ||||
|     vllm | ||||
|     */dist-packages/vllm | ||||
|     */site-packages/vllm | ||||
| omit = | ||||
|     */tests/* | ||||
|     */test_* | ||||
| @ -12,6 +17,16 @@ omit = | ||||
|     */benchmarks/* | ||||
|     */docs/* | ||||
|  | ||||
| [paths] | ||||
| # Map all possible vllm locations to a canonical "vllm" path | ||||
| # This ensures coverage.combine properly merges data from different test runs | ||||
| source = | ||||
|     vllm | ||||
|     /vllm-workspace/src/vllm | ||||
|     /vllm-workspace/vllm | ||||
|     */site-packages/vllm | ||||
|     */dist-packages/vllm | ||||
|  | ||||
| [report] | ||||
| exclude_lines = | ||||
|     pragma: no cover | ||||
|  | ||||
							
								
								
									
										4
									
								
								.git-blame-ignore-revs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										4
									
								
								.git-blame-ignore-revs
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,4 @@ | ||||
| # Migrate from `yapf` & `isort` to `ruff` | ||||
| d6953beb91da4e9c99be4c0a1304a2d24189535c | ||||
| # Convert `Optional[x]` to `x | None` and `Union[x, y]` to `x | y` | ||||
| 8fcaaf6a165e661f63fc51be906bc05b0767332f | ||||
							
								
								
									
										24
									
								
								.github/CODEOWNERS
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										24
									
								
								.github/CODEOWNERS
									
									
									
									
										vendored
									
									
								
							| @ -5,15 +5,11 @@ | ||||
| /vllm/attention @LucasWilkinson | ||||
| /vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill | ||||
| /vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn | ||||
| /vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @22quinn | ||||
| /vllm/model_executor/layers/fused_moe @mgoin | ||||
| /vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill @NickLucche | ||||
| /vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256 | ||||
| /vllm/model_executor/layers/mamba @tdoublep | ||||
| /vllm/model_executor/model_loader @22quinn | ||||
| /vllm/multimodal @DarkLight1337 @ywang96 @NickLucche | ||||
| /vllm/v1/attention @LucasWilkinson | ||||
| /vllm/v1/sample @22quinn @houseroad | ||||
| /vllm/vllm_flash_attn @LucasWilkinson | ||||
| /vllm/lora @jeejeelee | ||||
| /vllm/reasoning @aarnphm @chaunceyjiang | ||||
| @ -25,14 +21,16 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson | ||||
| # Any change to the VllmConfig changes can have a large user-facing impact, | ||||
| # so spam a lot of people | ||||
| /vllm/config @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg | ||||
| /vllm/config/cache.py @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg @heheda12345 | ||||
|  | ||||
| # vLLM V1 | ||||
| /vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat | ||||
| /vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett | ||||
| /vllm/v1/spec_decode @benchislett @luccafong | ||||
| /vllm/v1/attention @LucasWilkinson | ||||
| /vllm/v1/attention/backends/flashinfer.py @mgoin | ||||
| /vllm/v1/attention/backends/triton_attn.py @tdoublep | ||||
| /vllm/v1/core @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat @heheda12345 @ApostaC | ||||
| /vllm/v1/sample @22quinn @houseroad @njhill | ||||
| /vllm/v1/spec_decode @benchislett @luccafong | ||||
| /vllm/v1/structured_output @mgoin @russellb @aarnphm @benchislett | ||||
| /vllm/v1/kv_cache_interface.py @heheda12345 | ||||
| /vllm/v1/offloading @ApostaC | ||||
|  | ||||
| @ -54,12 +52,12 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson | ||||
| /tests/weight_loading @mgoin @youkaichao @yewentao256 | ||||
| /tests/lora @jeejeelee | ||||
| /tests/models/language/generation/test_hybrid.py @tdoublep | ||||
| /tests/v1/kv_connector/nixl_integration @NickLucche  | ||||
| /tests/v1/kv_connector/nixl_integration @NickLucche | ||||
| /tests/v1/kv_connector @ApostaC | ||||
| /tests/v1/offloading @ApostaC | ||||
|  | ||||
| # Transformers backend | ||||
| /vllm/model_executor/models/transformers.py @hmellor | ||||
| /vllm/model_executor/models/transformers @hmellor | ||||
| /tests/models/test_transformers.py @hmellor | ||||
|  | ||||
| # Docs | ||||
| @ -120,3 +118,11 @@ mkdocs.yaml @hmellor | ||||
|  | ||||
| # KVConnector installation files | ||||
| /requirements/kv_connectors.txt @NickLucche | ||||
|  | ||||
| # Pooling models | ||||
| /examples/*/pooling/ @noooop | ||||
| /tests/models/*/pooling* @noooop | ||||
| /tests/entrypoints/pooling @noooop | ||||
| /vllm/config/pooler.py @noooop | ||||
| /vllm/pooling_params.py @noooop | ||||
| /vllm/model_executor/layers/pooler.py @noooop | ||||
|  | ||||
							
								
								
									
										35
									
								
								.github/mergify.yml
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										35
									
								
								.github/mergify.yml
									
									
									
									
										vendored
									
									
								
							| @ -2,6 +2,7 @@ pull_request_rules: | ||||
| - name: label-documentation | ||||
|   description: Automatically apply documentation label | ||||
|   conditions: | ||||
|     - label != stale | ||||
|     - or: | ||||
|       - files~=^[^/]+\.md$ | ||||
|       - files~=^docs/ | ||||
| @ -10,10 +11,13 @@ pull_request_rules: | ||||
|     label: | ||||
|       add: | ||||
|         - documentation | ||||
|     comment: | ||||
|       message: "Documentation preview: https://vllm--{{number}}.org.readthedocs.build/en/{{number}}/" | ||||
|  | ||||
| - name: label-ci-build | ||||
|   description: Automatically apply ci/build label | ||||
|   conditions: | ||||
|     - label != stale | ||||
|     - or: | ||||
|       - files~=^\.github/ | ||||
|       - files~=\.buildkite/ | ||||
| @ -30,6 +34,7 @@ pull_request_rules: | ||||
| - name: label-deepseek | ||||
|   description: Automatically apply deepseek label | ||||
|   conditions: | ||||
|     - label != stale | ||||
|     - or: | ||||
|       - files~=^examples/.*deepseek.*\.py | ||||
|       - files~=^tests/.*deepseek.*\.py | ||||
| @ -46,6 +51,7 @@ pull_request_rules: | ||||
| - name: label-frontend | ||||
|   description: Automatically apply frontend label | ||||
|   conditions: | ||||
|     - label != stale | ||||
|     - files~=^vllm/entrypoints/ | ||||
|   actions: | ||||
|     label: | ||||
| @ -55,6 +61,7 @@ pull_request_rules: | ||||
| - name: label-llama | ||||
|   description: Automatically apply llama label | ||||
|   conditions: | ||||
|     - label != stale | ||||
|     - or: | ||||
|       - files~=^examples/.*llama.*\.py | ||||
|       - files~=^tests/.*llama.*\.py | ||||
| @ -70,6 +77,7 @@ pull_request_rules: | ||||
| - name: label-multi-modality | ||||
|   description: Automatically apply multi-modality label | ||||
|   conditions: | ||||
|     - label != stale | ||||
|     - or: | ||||
|       - files~=^vllm/multimodal/ | ||||
|       - files~=^tests/multimodal/ | ||||
| @ -83,6 +91,7 @@ pull_request_rules: | ||||
| - name: label-new-model | ||||
|   description: Automatically apply new-model label | ||||
|   conditions: | ||||
|     - label != stale | ||||
|     - and: | ||||
|       - files~=^vllm/model_executor/models/ | ||||
|       - files=vllm/model_executor/models/registry.py | ||||
| @ -94,6 +103,7 @@ pull_request_rules: | ||||
| - name: label-performance | ||||
|   description: Automatically apply performance label | ||||
|   conditions: | ||||
|     - label != stale | ||||
|     - or: | ||||
|       - files~=^benchmarks/ | ||||
|       - files~=^vllm/benchmarks/ | ||||
| @ -107,6 +117,7 @@ pull_request_rules: | ||||
| - name: label-qwen | ||||
|   description: Automatically apply qwen label | ||||
|   conditions: | ||||
|     - label != stale | ||||
|     - or: | ||||
|       - files~=^examples/.*qwen.*\.py | ||||
|       - files~=^tests/.*qwen.*\.py | ||||
| @ -121,6 +132,7 @@ pull_request_rules: | ||||
| - name: label-gpt-oss | ||||
|   description: Automatically apply gpt-oss label | ||||
|   conditions: | ||||
|     - label != stale | ||||
|     - or: | ||||
|       - files~=^examples/.*gpt[-_]?oss.*\.py | ||||
|       - files~=^tests/.*gpt[-_]?oss.*\.py | ||||
| @ -142,6 +154,7 @@ pull_request_rules: | ||||
| - name: label-rocm | ||||
|   description: Automatically apply rocm label | ||||
|   conditions: | ||||
|     - label != stale | ||||
|     - or: | ||||
|       - files~=^csrc/rocm/ | ||||
|       - files~=^docker/Dockerfile.rocm | ||||
| @ -162,6 +175,7 @@ pull_request_rules: | ||||
| - name: label-structured-output | ||||
|   description: Automatically apply structured-output label | ||||
|   conditions: | ||||
|     - label != stale | ||||
|     - or: | ||||
|       - files~=^benchmarks/structured_schemas/ | ||||
|       - files=benchmarks/benchmark_serving_structured_output.py | ||||
| @ -181,6 +195,7 @@ pull_request_rules: | ||||
| - name: label-speculative-decoding | ||||
|   description: Automatically apply speculative-decoding label | ||||
|   conditions: | ||||
|     - label != stale | ||||
|     - or: | ||||
|       - files~=^vllm/v1/spec_decode/ | ||||
|       - files~=^tests/v1/spec_decode/ | ||||
| @ -196,6 +211,7 @@ pull_request_rules: | ||||
| - name: label-v1 | ||||
|   description: Automatically apply v1 label | ||||
|   conditions: | ||||
|     - label != stale | ||||
|     - or: | ||||
|       - files~=^vllm/v1/ | ||||
|       - files~=^tests/v1/ | ||||
| @ -208,6 +224,7 @@ pull_request_rules: | ||||
|   description: Automatically apply tpu label | ||||
|   # Keep this list in sync with `label-tpu-remove` conditions | ||||
|   conditions: | ||||
|     - label != stale | ||||
|     - or: | ||||
|       - files~=tpu.py | ||||
|       - files~=_tpu | ||||
| @ -223,6 +240,7 @@ pull_request_rules: | ||||
|   description: Automatically remove tpu label | ||||
|   # Keep this list in sync with `label-tpu` conditions | ||||
|   conditions: | ||||
|     - label != stale | ||||
|     - and: | ||||
|       - -files~=tpu.py | ||||
|       - -files~=_tpu | ||||
| @ -237,9 +255,9 @@ pull_request_rules: | ||||
| - name: label-tool-calling | ||||
|   description: Automatically add tool-calling label | ||||
|   conditions: | ||||
|     - label != stale | ||||
|     - or: | ||||
|       - files~=^tests/tool_use/ | ||||
|       - files~=^tests/mistral_tool_use/ | ||||
|       - files~=^tests/entrypoints/openai/tool_parsers/ | ||||
|       - files=tests/entrypoints/openai/test_chat_with_tool_reasoning.py | ||||
|       - files~=^vllm/entrypoints/openai/tool_parsers/ | ||||
| @ -256,8 +274,9 @@ pull_request_rules: | ||||
|  | ||||
| - name: ping author on conflicts and add 'needs-rebase' label | ||||
|   conditions: | ||||
|       - conflict | ||||
|       - -closed | ||||
|     - label != stale | ||||
|     - conflict | ||||
|     - -closed | ||||
|   actions: | ||||
|     label: | ||||
|       add: | ||||
| @ -271,10 +290,12 @@ pull_request_rules: | ||||
|  | ||||
| - name: assign reviewer for tensorizer changes | ||||
|   conditions: | ||||
|     - label != stale | ||||
|     - or: | ||||
|       - files~=^vllm/model_executor/model_loader/tensorizer.py | ||||
|       - files~=^vllm/model_executor/model_loader/tensorizer_loader.py | ||||
|       - files~=^tests/entrypoints/openai/test_tensorizer_entrypoint.py | ||||
|       - files~=^tests/tensorizer_loader/ | ||||
|       - files~=^tests/model_executor/model_loader/tensorizer_loader/ | ||||
|   actions: | ||||
|     assign: | ||||
|       users: | ||||
| @ -282,6 +303,7 @@ pull_request_rules: | ||||
|  | ||||
| - name: assign reviewer for modelopt changes | ||||
|   conditions: | ||||
|     - label != stale | ||||
|     - or: | ||||
|         - files~=^vllm/model_executor/layers/quantization/modelopt\.py$ | ||||
|         - files~=^vllm/model_executor/layers/quantization/__init__\.py$ | ||||
| @ -296,8 +318,8 @@ pull_request_rules: | ||||
|  | ||||
| - name: remove 'needs-rebase' label when conflict is resolved | ||||
|   conditions: | ||||
|       - -conflict | ||||
|       - -closed | ||||
|     - -conflict | ||||
|     - -closed | ||||
|   actions: | ||||
|     label: | ||||
|       remove: | ||||
| @ -306,6 +328,7 @@ pull_request_rules: | ||||
| - name: label-kv-connector | ||||
|   description: Automatically apply kv-connector label | ||||
|   conditions: | ||||
|     - label != stale | ||||
|     - or: | ||||
|       - files~=^examples/online_serving/disaggregated[^/]*/.* | ||||
|       - files~=^examples/offline_inference/disaggregated[^/]*/.* | ||||
|  | ||||
							
								
								
									
										138
									
								
								.github/workflows/issue_autolabel.yml
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										138
									
								
								.github/workflows/issue_autolabel.yml
									
									
									
									
										vendored
									
									
								
							| @ -13,6 +13,7 @@ jobs: | ||||
|     runs-on: ubuntu-latest | ||||
|     steps: | ||||
|       - name: Label issues based on keywords | ||||
|         id: label-step | ||||
|         uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd  # v8.0.0 | ||||
|         with: | ||||
|           script: | | ||||
| @ -42,7 +43,6 @@ jobs: | ||||
|                     searchIn: "body" | ||||
|                   }, | ||||
|                 ], | ||||
|                  | ||||
|                 // Substring search - matches anywhere in text (partial matches) | ||||
|                 substrings: [ | ||||
|                   { | ||||
| @ -89,14 +89,12 @@ jobs: | ||||
|                     term: "hip_", | ||||
|                     searchIn: "both" | ||||
|                   }, | ||||
|                    | ||||
|                   // ROCm tools and libraries | ||||
|                   { | ||||
|                     term: "hipify", | ||||
|                     searchIn: "both" | ||||
|                   }, | ||||
|                 ], | ||||
|                  | ||||
|                 // Regex patterns - for complex pattern matching | ||||
|                 regexPatterns: [ | ||||
|                   { | ||||
| @ -107,13 +105,17 @@ jobs: | ||||
|                   } | ||||
|                 ], | ||||
|               }, | ||||
|               // Add more label configurations here as needed | ||||
|               // example: { | ||||
|               //   keywords: [...], | ||||
|               //   substrings: [...], | ||||
|               //   regexPatterns: [...] | ||||
|               // }, | ||||
|             }; | ||||
|              | ||||
|             // Helper function to create regex based on search type | ||||
|             function createSearchRegex(term, type) { | ||||
|               // Escape special regex characters in the term | ||||
|               const escapedTerm = term.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'); | ||||
|                | ||||
|               switch (type) { | ||||
|                 case 'keyword': | ||||
|                   // Word boundary search - matches whole words only | ||||
| @ -125,16 +127,13 @@ jobs: | ||||
|                   throw new Error(`Unknown search type: ${type}`); | ||||
|               } | ||||
|             } | ||||
|              | ||||
|             // Helper function to find matching terms in text with line information | ||||
|             function findMatchingTermsWithLines(text, searchTerms = [], searchType = 'keyword', searchLocation = '') { | ||||
|               const matches = []; | ||||
|               const lines = text.split('\n'); | ||||
|                | ||||
|               for (const termConfig of searchTerms) { | ||||
|                 let regex; | ||||
|                 let term, searchIn, pattern, description, flags; | ||||
|                  | ||||
|                 // Handle different input formats (string or object) | ||||
|                 if (typeof termConfig === 'string') { | ||||
|                   term = termConfig; | ||||
| @ -146,21 +145,17 @@ jobs: | ||||
|                   description = termConfig.description; | ||||
|                   flags = termConfig.flags; | ||||
|                 } | ||||
|                  | ||||
|                 // Skip if this term shouldn't be searched in the current location | ||||
|                 if (searchIn !== 'both' && searchIn !== searchLocation) { | ||||
|                   continue; | ||||
|                 } | ||||
|                  | ||||
|                 // Create appropriate regex | ||||
|                 if (searchType === 'regex') { | ||||
|                   regex = new RegExp(pattern, flags || "gi"); | ||||
|                 } else { | ||||
|                   regex = createSearchRegex(term, searchType); | ||||
|                 } | ||||
|                  | ||||
|                 const termMatches = []; | ||||
|                  | ||||
|                 // Check each line for matches | ||||
|                 lines.forEach((line, lineIndex) => { | ||||
|                   const lineMatches = line.match(regex); | ||||
| @ -175,15 +170,14 @@ jobs: | ||||
|                         originalTerm: term || pattern, | ||||
|                         description: description, | ||||
|                         // Show context around the match in the line | ||||
|                         context: line.length > 100 ?  | ||||
|                           line.substring(Math.max(0, line.toLowerCase().indexOf(match.toLowerCase()) - 30),  | ||||
|                                        line.toLowerCase().indexOf(match.toLowerCase()) + match.length + 30) + '...'  | ||||
|                         context: line.length > 100 ? | ||||
|                           line.substring(Math.max(0, line.toLowerCase().indexOf(match.toLowerCase()) - 30), | ||||
|                                        line.toLowerCase().indexOf(match.toLowerCase()) + match.length + 30) + '...' | ||||
|                           : line.trim() | ||||
|                       }); | ||||
|                     }); | ||||
|                   } | ||||
|                 }); | ||||
|                  | ||||
|                 if (termMatches.length > 0) { | ||||
|                   matches.push({ | ||||
|                     term: term || (description || pattern), | ||||
| @ -196,64 +190,48 @@ jobs: | ||||
|                   }); | ||||
|                 } | ||||
|               } | ||||
|                | ||||
|               return matches; | ||||
|             } | ||||
|              | ||||
|             // Helper function to check if label should be added | ||||
|             async function processLabel(labelName, config) { | ||||
|               const body = context.payload.issue.body || ""; | ||||
|               const title = context.payload.issue.title || ""; | ||||
|                | ||||
|               core.notice(`Processing label: ${labelName}`); | ||||
|               core.notice(`Issue Title: "${title}"`); | ||||
|               core.notice(`Issue Body length: ${body.length} characters`); | ||||
|                | ||||
|               let shouldAddLabel = false; | ||||
|               let allMatches = []; | ||||
|               let reason = ''; | ||||
|                | ||||
|               const keywords = config.keywords || []; | ||||
|               const substrings = config.substrings || []; | ||||
|               const regexPatterns = config.regexPatterns || []; | ||||
|                | ||||
|               core.notice(`Searching with ${keywords.length} keywords, ${substrings.length} substrings, and ${regexPatterns.length} regex patterns`); | ||||
|                | ||||
|               // Search in title | ||||
|               if (title.trim()) { | ||||
|                 core.notice(`Searching in title: "${title}"`); | ||||
|                  | ||||
|                 const titleKeywordMatches = findMatchingTermsWithLines(title, keywords, 'keyword', 'title'); | ||||
|                 const titleSubstringMatches = findMatchingTermsWithLines(title, substrings, 'substring', 'title'); | ||||
|                 const titleRegexMatches = findMatchingTermsWithLines(title, regexPatterns, 'regex', 'title'); | ||||
|                  | ||||
|                 allMatches.push(...titleKeywordMatches, ...titleSubstringMatches, ...titleRegexMatches); | ||||
|               } | ||||
|                | ||||
|               // Search in body | ||||
|               if (body.trim()) { | ||||
|                 core.notice(`Searching in body (${body.length} characters)`); | ||||
|                  | ||||
|                 const bodyKeywordMatches = findMatchingTermsWithLines(body, keywords, 'keyword', 'body'); | ||||
|                 const bodySubstringMatches = findMatchingTermsWithLines(body, substrings, 'substring', 'body'); | ||||
|                 const bodyRegexMatches = findMatchingTermsWithLines(body, regexPatterns, 'regex', 'body'); | ||||
|                  | ||||
|                 allMatches.push(...bodyKeywordMatches, ...bodySubstringMatches, ...bodyRegexMatches); | ||||
|               } | ||||
|                | ||||
|               if (allMatches.length > 0) { | ||||
|                 core.notice(`Found ${allMatches.length} matching term(s):`); | ||||
|                  | ||||
|                 for (const termMatch of allMatches) { | ||||
|                   const locationText = termMatch.searchLocation === 'title' ? 'title' : 'body'; | ||||
|                   const searchInText = termMatch.searchIn === 'both' ? 'both' : termMatch.searchIn; | ||||
|                    | ||||
|                   if (termMatch.searchType === 'regex') { | ||||
|                     core.notice(`  📍 Regex: "${termMatch.term}" (pattern: ${termMatch.pattern}) found ${termMatch.count} time(s) in ${locationText} (configured to search in: ${searchInText}):`); | ||||
|                   } else { | ||||
|                     core.notice(`  📍 Term: "${termMatch.term}" (${termMatch.searchType} search) found ${termMatch.count} time(s) in ${locationText} (configured to search in: ${searchInText}):`); | ||||
|                   } | ||||
|                    | ||||
|                   // Show details for each match | ||||
|                   termMatch.matches.forEach((match, index) => { | ||||
|                     core.notice(`    ${index + 1}. Line ${match.lineNumber} in ${match.searchLocation}: "${match.match}" [${match.searchType}]`); | ||||
| @ -266,7 +244,6 @@ jobs: | ||||
|                     } | ||||
|                   }); | ||||
|                 } | ||||
|                  | ||||
|                 shouldAddLabel = true; | ||||
|                 const totalMatches = allMatches.reduce((sum, t) => sum + t.count, 0); | ||||
|                 const titleMatches = allMatches.filter(t => t.searchLocation === 'title').reduce((sum, t) => sum + t.count, 0); | ||||
| @ -274,13 +251,10 @@ jobs: | ||||
|                 const keywordMatches = allMatches.filter(t => t.searchType === 'keyword').reduce((sum, t) => sum + t.count, 0); | ||||
|                 const substringMatches = allMatches.filter(t => t.searchType === 'substring').reduce((sum, t) => sum + t.count, 0); | ||||
|                 const regexMatches = allMatches.filter(t => t.searchType === 'regex').reduce((sum, t) => sum + t.count, 0); | ||||
|                  | ||||
|                 reason = `Found ${totalMatches} total matches (${titleMatches} in title, ${bodyMatches} in body) - ${keywordMatches} keyword matches, ${substringMatches} substring matches, ${regexMatches} regex matches`; | ||||
|               } | ||||
|                | ||||
|               core.notice(`Final decision: ${shouldAddLabel ? 'ADD LABEL' : 'DO NOT ADD LABEL'}`); | ||||
|               core.notice(`Reason: ${reason || 'No matching terms found'}`); | ||||
|                | ||||
|               if (shouldAddLabel) { | ||||
|                 const existingLabels = context.payload.issue.labels.map(l => l.name); | ||||
|                 if (!existingLabels.includes(labelName)) { | ||||
| @ -296,14 +270,92 @@ jobs: | ||||
|                 core.notice(`Label "${labelName}" already present.`); | ||||
|                 return false; | ||||
|               } | ||||
|                | ||||
|               core.notice(`No matching terms found for label "${labelName}".`); | ||||
|               return false; | ||||
|             } | ||||
|              | ||||
|             // Process all configured labels | ||||
|             const processLabels = Object.entries(labelConfig) | ||||
|               .map(([labelName, config]) => processLabel(labelName, config)); | ||||
|             const labelsAdded = await Promise.all(processLabels); | ||||
|             const numLabelsAdded = labelsAdded.reduce((x, y) => x + y, 0); | ||||
|             core.notice(`Processing complete. ${numLabelsAdded} label(s) added.`); | ||||
|             const labelsAddedResults = await Promise.all( | ||||
|               Object.entries(labelConfig).map(([labelName, config]) =>  | ||||
|                 processLabel(labelName, config).then(added => ({ labelName, added })) | ||||
|               ) | ||||
|             ); | ||||
|              | ||||
|             const numLabelsAdded = labelsAddedResults.filter(r => r.added).length; | ||||
|             core.notice(`Processing complete. ${numLabelsAdded} label(s) added.`); | ||||
|              | ||||
|             // Return which labels were added for the next step | ||||
|             const addedLabels = labelsAddedResults.filter(r => r.added).map(r => r.labelName); | ||||
|             core.setOutput('labels_added', JSON.stringify(addedLabels)); | ||||
|             return addedLabels; | ||||
|  | ||||
|       - name: CC users for labeled issues | ||||
|         if: steps.label-step.outputs.labels_added != '[]' | ||||
|         uses: actions/github-script@ed597411d8f924073f98dfc5c65a23a2325f34cd  # v8.0.0 | ||||
|         with: | ||||
|           script: | | ||||
|             // Configuration: Map labels to GitHub users to CC | ||||
|             // You can add multiple users per label, and multiple label configurations | ||||
|             const ccConfig = { | ||||
|               rocm: { | ||||
|                 users: ['hongxiayang', 'tjtanaa', 'vllmellm'],  // Add more users as needed: ['user1', 'user2', 'user3'] | ||||
|                 message: 'CC {users} for ROCm-related issue'  // {users} will be replaced with @mentions | ||||
|               }, | ||||
|               // Add more label -> user mappings here | ||||
|               // Example: | ||||
|               // cuda: { | ||||
|               //   users: ['user1', 'user2'], | ||||
|               //   message: 'CC {users} for CUDA-related issue' | ||||
|               // }, | ||||
|               // performance: { | ||||
|               //   users: ['perfexpert'], | ||||
|               //   message: 'CC {users} for performance issue' | ||||
|               // }, | ||||
|             }; | ||||
|              | ||||
|             const labelsAdded = JSON.parse('${{ steps.label-step.outputs.labels_added }}'); | ||||
|             core.notice(`Labels added: ${labelsAdded.join(', ')}`); | ||||
|              | ||||
|             // Get existing comments to check for already mentioned users | ||||
|             const comments = await github.rest.issues.listComments({ | ||||
|               owner: context.repo.owner, | ||||
|               repo: context.repo.repo, | ||||
|               issue_number: context.issue.number, | ||||
|             }); | ||||
|              | ||||
|             const issueBody = context.payload.issue.body || ''; | ||||
|             const allExistingText = issueBody + '\n' + comments.data.map(c => c.body).join('\n'); | ||||
|              | ||||
|             // Process each label that was added | ||||
|             for (const label of labelsAdded) { | ||||
|               if (ccConfig[label]) { | ||||
|                 const config = ccConfig[label]; | ||||
|                 const usersToMention = []; | ||||
|                  | ||||
|                 // Check which users haven't been mentioned yet | ||||
|                 for (const user of config.users) { | ||||
|                   const mentionPattern = new RegExp(`@${user}\\b`, 'i'); | ||||
|                   if (!mentionPattern.test(allExistingText)) { | ||||
|                     usersToMention.push(user); | ||||
|                   } else { | ||||
|                     core.notice(`@${user} already mentioned for label "${label}", skipping`); | ||||
|                   } | ||||
|                 } | ||||
|                  | ||||
|                 // Post comment if there are users to mention | ||||
|                 if (usersToMention.length > 0) { | ||||
|                   const mentions = usersToMention.map(u => `@${u}`).join(' '); | ||||
|                   const message = config.message.replace('{users}', mentions); | ||||
|                    | ||||
|                   await github.rest.issues.createComment({ | ||||
|                     owner: context.repo.owner, | ||||
|                     repo: context.repo.repo, | ||||
|                     issue_number: context.issue.number, | ||||
|                     body: message | ||||
|                   }); | ||||
|                    | ||||
|                   core.notice(`CC comment added for label "${label}": ${mentions}`); | ||||
|                 } else { | ||||
|                   core.notice(`All users for label "${label}" already mentioned, skipping comment`); | ||||
|                 } | ||||
|               } | ||||
|             } | ||||
							
								
								
									
										2
									
								
								.github/workflows/stale.yml
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										2
									
								
								.github/workflows/stale.yml
									
									
									
									
										vendored
									
									
								
							| @ -13,7 +13,7 @@ jobs: | ||||
|       actions: write | ||||
|     runs-on: ubuntu-latest | ||||
|     steps: | ||||
|       - uses: actions/stale@3a9db7e6a41a89f618792c92c0e97cc736e1b13f # v10.0.0 | ||||
|       - uses: actions/stale@5f858e3efba33a5ca4407a664cc011ad407f2008 # v10.1.0 | ||||
|         with: | ||||
|           # Increasing this value ensures that changes to this workflow | ||||
|           # propagate to all issues and PRs in days rather than months | ||||
|  | ||||
| @ -4,7 +4,6 @@ MD013: false | ||||
| MD024: | ||||
|   siblings_only: true | ||||
| MD033: false | ||||
| MD042: false | ||||
| MD045: false | ||||
| MD046: false | ||||
| MD051: false | ||||
|  | ||||
| @ -6,30 +6,19 @@ default_stages: | ||||
|   - manual # Run in CI | ||||
| exclude: 'vllm/third_party/.*' | ||||
| repos: | ||||
| - repo: https://github.com/google/yapf | ||||
|   rev: v0.43.0 | ||||
|   hooks: | ||||
|   - id: yapf | ||||
|     args: [--in-place, --verbose] | ||||
|     # Keep the same list from yapfignore here to avoid yapf failing without any inputs | ||||
|     exclude: '(.buildkite|benchmarks|build|examples)/.*' | ||||
| - repo: https://github.com/astral-sh/ruff-pre-commit | ||||
|   rev: v0.11.7 | ||||
|   rev: v0.14.0 | ||||
|   hooks: | ||||
|   - id: ruff | ||||
|   - id: ruff-check | ||||
|     args: [--output-format, github, --fix] | ||||
|   - id: ruff-format | ||||
|     files: ^(.buildkite|benchmarks|examples)/.* | ||||
| - repo: https://github.com/crate-ci/typos | ||||
|   rev: v1.35.5 | ||||
|   rev: v1.38.1 | ||||
|   hooks: | ||||
|   - id: typos | ||||
| - repo: https://github.com/PyCQA/isort | ||||
|   rev: 6.0.1 | ||||
|   hooks: | ||||
|   - id: isort | ||||
|     args: [--force-exclude] | ||||
| - repo: https://github.com/pre-commit/mirrors-clang-format | ||||
|   rev: v20.1.3 | ||||
|   rev: v21.1.2 | ||||
|   hooks: | ||||
|   - id: clang-format | ||||
|     exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))|vllm/third_party/.*' | ||||
| @ -46,7 +35,7 @@ repos: | ||||
|   hooks: | ||||
|   - id: actionlint | ||||
| - repo: https://github.com/astral-sh/uv-pre-commit | ||||
|   rev: 0.6.17 | ||||
|   rev: 0.9.1 | ||||
|   hooks: | ||||
|     - id: pip-compile | ||||
|       args: [requirements/test.in, -o, requirements/test.txt, --index-strategy, unsafe-best-match, --torch-backend, cu128, --python-platform, x86_64-manylinux_2_28] | ||||
| @ -67,11 +56,6 @@ repos: | ||||
|       types_or: [python, pyi] | ||||
|       require_serial: true | ||||
|       additional_dependencies: [mypy==1.11.1, regex, types-cachetools, types-setuptools, types-PyYAML, types-requests, types-torch, pydantic] | ||||
|   - id: mypy-3.9 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward | ||||
|     name: Run mypy for Python 3.9 | ||||
|     entry: python tools/pre_commit/mypy.py 1 "3.9" | ||||
|     <<: *mypy_common | ||||
|     stages: [manual] # Only run in CI | ||||
|   - id: mypy-3.10 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward | ||||
|     name: Run mypy for Python 3.10 | ||||
|     entry: python tools/pre_commit/mypy.py 1 "3.10" | ||||
| @ -87,6 +71,11 @@ repos: | ||||
|     entry: python tools/pre_commit/mypy.py 1 "3.12" | ||||
|     <<: *mypy_common | ||||
|     stages: [manual] # Only run in CI | ||||
|   - id: mypy-3.13 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward | ||||
|     name: Run mypy for Python 3.13 | ||||
|     entry: python tools/pre_commit/mypy.py 1 "3.13" | ||||
|     <<: *mypy_common | ||||
|     stages: [manual] # Only run in CI | ||||
|   - id: shellcheck | ||||
|     name: Lint shell scripts | ||||
|     entry: tools/shellcheck.sh | ||||
|  | ||||
							
								
								
									
										109
									
								
								CMakeLists.txt
									
									
									
									
									
								
							
							
						
						
									
										109
									
								
								CMakeLists.txt
									
									
									
									
									
								
							| @ -34,10 +34,10 @@ install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS) | ||||
| # Supported python versions.  These versions will be searched in order, the | ||||
| # first match will be selected.  These should be kept in sync with setup.py. | ||||
| # | ||||
| set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12" "3.13") | ||||
| set(PYTHON_SUPPORTED_VERSIONS "3.10" "3.11" "3.12" "3.13") | ||||
|  | ||||
| # Supported AMD GPU architectures. | ||||
| set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201") | ||||
| set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151") | ||||
|  | ||||
| # | ||||
| # Supported/expected torch versions for CUDA/ROCm. | ||||
| @ -86,6 +86,9 @@ find_package(Torch REQUIRED) | ||||
| # Supported NVIDIA architectures. | ||||
| # This check must happen after find_package(Torch) because that's when CMAKE_CUDA_COMPILER_VERSION gets defined | ||||
| if(DEFINED CMAKE_CUDA_COMPILER_VERSION AND | ||||
|    CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0) | ||||
|   set(CUDA_SUPPORTED_ARCHS "7.5;8.0;8.6;8.7;8.9;9.0;10.0;11.0;12.0") | ||||
| elseif(DEFINED CMAKE_CUDA_COMPILER_VERSION AND | ||||
|    CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8) | ||||
|   set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0") | ||||
| else() | ||||
| @ -175,6 +178,15 @@ if(NVCC_THREADS AND VLLM_GPU_LANG STREQUAL "CUDA") | ||||
|   list(APPEND VLLM_GPU_FLAGS "--threads=${NVCC_THREADS}") | ||||
| endif() | ||||
|  | ||||
| # | ||||
| # Set compression mode for CUDA >=13.x. | ||||
| # | ||||
| if(VLLM_GPU_LANG STREQUAL "CUDA" AND | ||||
|    DEFINED CMAKE_CUDA_COMPILER_VERSION AND | ||||
|    CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0) | ||||
|   list(APPEND VLLM_GPU_FLAGS "--compress-mode=size") | ||||
| endif() | ||||
|  | ||||
| # | ||||
| # Set CUDA include flags for CXX compiler. | ||||
| # | ||||
| @ -257,8 +269,8 @@ set(VLLM_EXT_SRC | ||||
|   "csrc/sampler.cu" | ||||
|   "csrc/cuda_view.cu" | ||||
|   "csrc/quantization/gptq/q_gemm.cu" | ||||
|   "csrc/quantization/compressed_tensors/int8_quant_kernels.cu" | ||||
|   "csrc/quantization/fp8/common.cu" | ||||
|   "csrc/quantization/w8a8/int8/scaled_quant.cu" | ||||
|   "csrc/quantization/w8a8/fp8/common.cu" | ||||
|   "csrc/quantization/fused_kernels/fused_layernorm_dynamic_per_token_quant.cu" | ||||
|   "csrc/quantization/gguf/gguf_kernel.cu" | ||||
|   "csrc/quantization/activation_kernels.cu" | ||||
| @ -270,7 +282,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") | ||||
|   SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library") | ||||
|  | ||||
|   # Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages when building. | ||||
|   set(CUTLASS_REVISION "v4.0.0" CACHE STRING "CUTLASS revision to use") | ||||
|   set(CUTLASS_REVISION "v4.2.1" CACHE STRING "CUTLASS revision to use") | ||||
|  | ||||
|   # Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided | ||||
|   if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR}) | ||||
| @ -302,13 +314,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") | ||||
|   list(APPEND VLLM_EXT_SRC | ||||
|     "csrc/quantization/awq/gemm_kernels.cu" | ||||
|     "csrc/permute_cols.cu" | ||||
|     "csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu" | ||||
|     "csrc/quantization/w8a8/cutlass/scaled_mm_entry.cu" | ||||
|     "csrc/quantization/fp4/nvfp4_quant_entry.cu" | ||||
|     "csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu" | ||||
|     "csrc/quantization/fp4/nvfp4_blockwise_moe_kernel.cu" | ||||
|     "csrc/sparse/cutlass/sparse_scaled_mm_entry.cu" | ||||
|     "csrc/cutlass_extensions/common.cpp" | ||||
|     "csrc/quantization/fp8/per_token_group_quant.cu") | ||||
|     "csrc/quantization/w8a8/fp8/per_token_group_quant.cu" | ||||
|     "csrc/quantization/w8a8/int8/per_token_group_quant.cu") | ||||
|  | ||||
|   set_gencode_flags_for_srcs( | ||||
|     SRCS "${VLLM_EXT_SRC}" | ||||
| @ -412,11 +424,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") | ||||
|   cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}") | ||||
|   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0 AND SCALED_MM_ARCHS) | ||||
|     set(SRCS | ||||
|        "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm90.cu" | ||||
|        "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu" | ||||
|        "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_int8.cu" | ||||
|        "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_azp_sm90_int8.cu" | ||||
|        "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8.cu") | ||||
|        "csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm90.cu" | ||||
|        "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm90_fp8.cu" | ||||
|        "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm90_int8.cu" | ||||
|        "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_azp_sm90_int8.cu" | ||||
|        "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm90_fp8.cu") | ||||
|     set_gencode_flags_for_srcs( | ||||
|       SRCS "${SRCS}" | ||||
|       CUDA_ARCHS "${SCALED_MM_ARCHS}") | ||||
| @ -440,12 +452,16 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") | ||||
|  | ||||
|   # The cutlass_scaled_mm kernels for Geforce Blackwell SM120 (c3x, i.e. CUTLASS 3.x) require | ||||
|   # CUDA 12.8 or later | ||||
|   cuda_archs_loose_intersection(SCALED_MM_ARCHS "12.0;12.0a" "${CUDA_ARCHS}") | ||||
|   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0) | ||||
|     cuda_archs_loose_intersection(SCALED_MM_ARCHS "12.0f" "${CUDA_ARCHS}") | ||||
|   else() | ||||
|     cuda_archs_loose_intersection(SCALED_MM_ARCHS "12.0a" "${CUDA_ARCHS}") | ||||
|   endif() | ||||
|   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS) | ||||
|     set(SRCS | ||||
|       "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm120.cu" | ||||
|       "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm120_fp8.cu" | ||||
|       "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm120_fp8.cu" | ||||
|       "csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm120.cu" | ||||
|       "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm120_fp8.cu" | ||||
|       "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm120_fp8.cu" | ||||
|     ) | ||||
|     set_gencode_flags_for_srcs( | ||||
|       SRCS "${SRCS}" | ||||
| @ -470,12 +486,16 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") | ||||
|  | ||||
|   # The cutlass_scaled_mm kernels for Blackwell SM100 (c3x, i.e. CUTLASS 3.x) | ||||
|   # require CUDA 12.8 or later | ||||
|   cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a" "${CUDA_ARCHS}") | ||||
|   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0) | ||||
|     cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}") | ||||
|   else() | ||||
|     cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}") | ||||
|   endif() | ||||
|   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS) | ||||
|     set(SRCS | ||||
|       "csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu" | ||||
|       "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu" | ||||
|       "csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm100_fp8.cu" | ||||
|       "csrc/quantization/w8a8/cutlass/scaled_mm_c3x_sm100.cu" | ||||
|       "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_sm100_fp8.cu" | ||||
|       "csrc/quantization/w8a8/cutlass/c3x/scaled_mm_blockwise_sm100_fp8.cu" | ||||
|     ) | ||||
|     set_gencode_flags_for_srcs( | ||||
|       SRCS "${SRCS}" | ||||
| @ -506,7 +526,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") | ||||
|   # subtract out the archs that are already built for 3x | ||||
|   list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS}) | ||||
|   if (SCALED_MM_2X_ARCHS) | ||||
|     set(SRCS "csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu") | ||||
|     set(SRCS "csrc/quantization/w8a8/cutlass/scaled_mm_c2x.cu") | ||||
|     set_gencode_flags_for_srcs( | ||||
|       SRCS "${SRCS}" | ||||
|       CUDA_ARCHS "${SCALED_MM_2X_ARCHS}") | ||||
| @ -550,7 +570,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") | ||||
|  | ||||
|   # The nvfp4_scaled_mm_sm120 kernels for Geforce Blackwell SM120 require | ||||
|   # CUDA 12.8 or later | ||||
|   cuda_archs_loose_intersection(FP4_ARCHS "12.0;12.0a" "${CUDA_ARCHS}") | ||||
|   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0) | ||||
|     cuda_archs_loose_intersection(FP4_ARCHS "12.0f" "${CUDA_ARCHS}") | ||||
|   else() | ||||
|     cuda_archs_loose_intersection(FP4_ARCHS "12.0a" "${CUDA_ARCHS}") | ||||
|   endif() | ||||
|   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS) | ||||
|     set(SRCS | ||||
|       "csrc/quantization/fp4/nvfp4_quant_kernels.cu" | ||||
| @ -569,7 +593,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") | ||||
|   endif() | ||||
|  | ||||
|   # FP4 Archs and flags | ||||
|   cuda_archs_loose_intersection(FP4_ARCHS "10.0a" "${CUDA_ARCHS}") | ||||
|   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0) | ||||
|     cuda_archs_loose_intersection(FP4_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}") | ||||
|   else() | ||||
|     cuda_archs_loose_intersection(FP4_ARCHS "10.0a;10.1a;12.0a;12.1a" "${CUDA_ARCHS}") | ||||
|   endif() | ||||
|   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS) | ||||
|     set(SRCS | ||||
|       "csrc/quantization/fp4/nvfp4_quant_kernels.cu" | ||||
| @ -591,7 +619,11 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") | ||||
|   endif() | ||||
|  | ||||
|   # CUTLASS MLA Archs and flags | ||||
|   cuda_archs_loose_intersection(MLA_ARCHS "10.0a" "${CUDA_ARCHS}") | ||||
|   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0) | ||||
|     cuda_archs_loose_intersection(MLA_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}") | ||||
|   else() | ||||
|     cuda_archs_loose_intersection(MLA_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}") | ||||
|   endif() | ||||
|   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND MLA_ARCHS) | ||||
|     set(SRCS | ||||
|       "csrc/attention/mla/sm100_cutlass_mla_kernel.cu") | ||||
| @ -617,7 +649,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") | ||||
|   # if it's possible to compile MoE kernels that use its output. | ||||
|   cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a" "${CUDA_ARCHS}") | ||||
|   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS) | ||||
|     set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x_sm90.cu") | ||||
|     set(SRCS "csrc/quantization/w8a8/cutlass/moe/grouped_mm_c3x_sm90.cu") | ||||
|     set_gencode_flags_for_srcs( | ||||
|       SRCS "${SRCS}" | ||||
|       CUDA_ARCHS "${SCALED_MM_ARCHS}") | ||||
| @ -635,9 +667,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") | ||||
|     endif() | ||||
|   endif() | ||||
|  | ||||
|   cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}") | ||||
|   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0) | ||||
|     cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f" "${CUDA_ARCHS}") | ||||
|   else() | ||||
|     cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}") | ||||
|   endif() | ||||
|   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS) | ||||
|     set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x_sm100.cu") | ||||
|     set(SRCS "csrc/quantization/w8a8/cutlass/moe/grouped_mm_c3x_sm100.cu") | ||||
|     set_gencode_flags_for_srcs( | ||||
|       SRCS "${SRCS}" | ||||
|       CUDA_ARCHS "${SCALED_MM_ARCHS}") | ||||
| @ -656,9 +692,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") | ||||
|   endif() | ||||
|  | ||||
|   # moe_data.cu is used by all CUTLASS MoE kernels. | ||||
|   cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0a" "${CUDA_ARCHS}") | ||||
|   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0) | ||||
|     cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0f;11.0f;12.0f" "${CUDA_ARCHS}") | ||||
|   else() | ||||
|     cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}") | ||||
|   endif() | ||||
|   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND CUTLASS_MOE_DATA_ARCHS) | ||||
|     set(SRCS "csrc/quantization/cutlass_w8a8/moe/moe_data.cu") | ||||
|     set(SRCS "csrc/quantization/w8a8/cutlass/moe/moe_data.cu") | ||||
|     set_gencode_flags_for_srcs( | ||||
|       SRCS "${SRCS}" | ||||
|       CUDA_ARCHS "${CUTLASS_MOE_DATA_ARCHS}") | ||||
| @ -675,9 +715,13 @@ if(VLLM_GPU_LANG STREQUAL "CUDA") | ||||
|     endif() | ||||
|   endif() | ||||
|  | ||||
|   cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a" "${CUDA_ARCHS}") | ||||
|   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0) | ||||
|     cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0f;11.0f;12.0f" "${CUDA_ARCHS}") | ||||
|   else() | ||||
|     cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;10.3a;12.0a;12.1a" "${CUDA_ARCHS}") | ||||
|   endif() | ||||
|   if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND SCALED_MM_ARCHS) | ||||
|     set(SRCS "csrc/quantization/cutlass_w8a8/moe/blockwise_scaled_group_mm_sm100.cu") | ||||
|     set(SRCS "csrc/quantization/w8a8/cutlass/moe/blockwise_scaled_group_mm_sm100.cu") | ||||
|     set_gencode_flags_for_srcs( | ||||
|       SRCS "${SRCS}" | ||||
|       CUDA_ARCHS "${SCALED_MM_ARCHS}") | ||||
| @ -963,6 +1007,7 @@ endif() | ||||
| # For CUDA we also build and ship some external projects. | ||||
| if (VLLM_GPU_LANG STREQUAL "CUDA") | ||||
|     include(cmake/external_projects/flashmla.cmake) | ||||
|     include(cmake/external_projects/qutlass.cmake) | ||||
|  | ||||
|     # vllm-flash-attn should be last as it overwrites some CMake functions | ||||
|     include(cmake/external_projects/vllm_flash_attn.cmake) | ||||
|  | ||||
| @ -21,6 +21,7 @@ Join us at the [PyTorch Conference, October 22-23](https://events.linuxfoundatio | ||||
|  | ||||
| *Latest News* 🔥 | ||||
|  | ||||
| - [2025/09] We hosted [vLLM Toronto Meetup](https://luma.com/e80e0ymm) focused on tackling inference at scale and speculative decoding with speakers from NVIDIA and Red Hat! Please find the meetup slides [here](https://docs.google.com/presentation/d/1IYJYmJcu9fLpID5N5RbW_vO0XLo0CGOR14IXOjB61V8/edit?usp=sharing). | ||||
| - [2025/08] We hosted [vLLM Shenzhen Meetup](https://mp.weixin.qq.com/s/k8ZBO1u2_2odgiKWH_GVTQ) focusing on the ecosystem around vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Ua2SVKVSu-wp5vou_6ElraDt2bnKhiEA). | ||||
| - [2025/08] We hosted [vLLM Singapore Meetup](https://www.sginnovate.com/event/vllm-sg-meet). We shared V1 updates, disaggregated serving and MLLM speedups with speakers from Embedded LLM, AMD, WekaIO, and A*STAR. Please find the meetup slides [here](https://drive.google.com/drive/folders/1ncf3GyqLdqFaB6IeB834E5TZJPLAOiXZ?usp=sharing). | ||||
| - [2025/08] We hosted [vLLM Shanghai Meetup](https://mp.weixin.qq.com/s/pDmAXHcN7Iqc8sUKgJgGtg) focusing on building, developing, and integrating with vLLM! Please find the meetup slides [here](https://drive.google.com/drive/folders/1OvLx39wnCGy_WKq8SiVKf7YcxxYI3WCH). | ||||
| @ -148,6 +149,7 @@ Compute Resources: | ||||
| - Trainy | ||||
| - UC Berkeley | ||||
| - UC San Diego | ||||
| - Volcengine | ||||
|  | ||||
| Slack Sponsor: Anyscale | ||||
|  | ||||
|  | ||||
| @ -74,7 +74,7 @@ start_server() { | ||||
|     local vllm_log=$4 | ||||
|     local profile_dir=$5 | ||||
|  | ||||
|     pkill -if vllm | ||||
|     pkill -if "vllm serve" || true | ||||
|  | ||||
|     # Define the common arguments as a bash array. | ||||
|     # Each argument and its value are separate elements. | ||||
| @ -96,11 +96,11 @@ start_server() { | ||||
|     # This correctly passes each element as a separate argument. | ||||
|     if [[ -n "$profile_dir" ]]; then | ||||
|         # Start server with profiling enabled | ||||
|         VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir \ | ||||
|         VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir \ | ||||
|             vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 & | ||||
|     else | ||||
|         # Start server without profiling | ||||
|         VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 \ | ||||
|         VLLM_SERVER_DEV_MODE=1 \ | ||||
|             vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 & | ||||
|     fi | ||||
|     local server_pid=$! | ||||
| @ -139,7 +139,7 @@ run_benchmark() { | ||||
|     echo "vllm_log: $vllm_log" | ||||
|     echo | ||||
|     rm -f $vllm_log | ||||
|     pkill -if vllm | ||||
|     pkill -if "vllm serve" || true | ||||
|  | ||||
|     echo "starting server..." | ||||
|     # Call start_server without a profile_dir to avoid profiling overhead | ||||
| @ -232,7 +232,7 @@ run_benchmark() { | ||||
|  | ||||
|     echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput" | ||||
|  | ||||
|     pkill -if vllm | ||||
|     pkill -if "vllm serve" || true | ||||
|     sleep 10 | ||||
|     echo "====================" | ||||
|     return 0 | ||||
| @ -308,6 +308,6 @@ if (( $(echo "$best_throughput > 0" | bc -l) )); then | ||||
| else | ||||
|     echo "No configuration met the latency requirements. Skipping final profiling run." | ||||
| fi | ||||
| pkill -if vllm | ||||
| pkill -if "vllm serve" || true | ||||
| echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH" | ||||
| echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH" >> "$RESULT" | ||||
|  | ||||
| @ -8,7 +8,6 @@ import sys | ||||
| import time | ||||
| import traceback | ||||
| from dataclasses import dataclass, field | ||||
| from typing import Optional, Union | ||||
|  | ||||
| import aiohttp | ||||
| import huggingface_hub.constants | ||||
| @ -28,13 +27,13 @@ class RequestFuncInput: | ||||
|     prompt_len: int | ||||
|     output_len: int | ||||
|     model: str | ||||
|     model_name: Optional[str] = None | ||||
|     logprobs: Optional[int] = None | ||||
|     extra_body: Optional[dict] = None | ||||
|     multi_modal_content: Optional[dict | list[dict]] = None | ||||
|     model_name: str | None = None | ||||
|     logprobs: int | None = None | ||||
|     extra_body: dict | None = None | ||||
|     multi_modal_content: dict | list[dict] | None = None | ||||
|     ignore_eos: bool = False | ||||
|     language: Optional[str] = None | ||||
|     request_id: Optional[str] = None | ||||
|     language: str | None = None | ||||
|     request_id: str | None = None | ||||
|  | ||||
|  | ||||
| @dataclass | ||||
| @ -52,7 +51,7 @@ class RequestFuncOutput: | ||||
|  | ||||
| async def async_request_tgi( | ||||
|     request_func_input: RequestFuncInput, | ||||
|     pbar: Optional[tqdm] = None, | ||||
|     pbar: tqdm | None = None, | ||||
| ) -> RequestFuncOutput: | ||||
|     api_url = request_func_input.api_url | ||||
|     assert api_url.endswith("generate_stream") | ||||
| @ -133,7 +132,7 @@ async def async_request_tgi( | ||||
|  | ||||
| async def async_request_trt_llm( | ||||
|     request_func_input: RequestFuncInput, | ||||
|     pbar: Optional[tqdm] = None, | ||||
|     pbar: tqdm | None = None, | ||||
| ) -> RequestFuncOutput: | ||||
|     api_url = request_func_input.api_url | ||||
|     assert api_url.endswith("generate_stream") | ||||
| @ -204,7 +203,7 @@ async def async_request_trt_llm( | ||||
|  | ||||
| async def async_request_deepspeed_mii( | ||||
|     request_func_input: RequestFuncInput, | ||||
|     pbar: Optional[tqdm] = None, | ||||
|     pbar: tqdm | None = None, | ||||
| ) -> RequestFuncOutput: | ||||
|     api_url = request_func_input.api_url | ||||
|     assert api_url.endswith(("completions", "profile")), ( | ||||
| @ -267,7 +266,7 @@ async def async_request_deepspeed_mii( | ||||
|  | ||||
| async def async_request_openai_completions( | ||||
|     request_func_input: RequestFuncInput, | ||||
|     pbar: Optional[tqdm] = None, | ||||
|     pbar: tqdm | None = None, | ||||
| ) -> RequestFuncOutput: | ||||
|     api_url = request_func_input.api_url | ||||
|     assert api_url.endswith(("completions", "profile")), ( | ||||
| @ -367,7 +366,7 @@ async def async_request_openai_completions( | ||||
|  | ||||
| async def async_request_openai_chat_completions( | ||||
|     request_func_input: RequestFuncInput, | ||||
|     pbar: Optional[tqdm] = None, | ||||
|     pbar: tqdm | None = None, | ||||
| ) -> RequestFuncOutput: | ||||
|     api_url = request_func_input.api_url | ||||
|     assert api_url.endswith(("chat/completions", "profile")), ( | ||||
| @ -476,7 +475,7 @@ async def async_request_openai_chat_completions( | ||||
|  | ||||
| async def async_request_openai_audio( | ||||
|     request_func_input: RequestFuncInput, | ||||
|     pbar: Optional[tqdm] = None, | ||||
|     pbar: tqdm | None = None, | ||||
| ) -> RequestFuncOutput: | ||||
|     # Lazy import without PlaceholderModule to avoid vllm dep. | ||||
|     import soundfile | ||||
| @ -610,7 +609,7 @@ def get_tokenizer( | ||||
|     tokenizer_mode: str = "auto", | ||||
|     trust_remote_code: bool = False, | ||||
|     **kwargs, | ||||
| ) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]: | ||||
| ) -> PreTrainedTokenizer | PreTrainedTokenizerFast: | ||||
|     if pretrained_model_name_or_path is not None and not os.path.exists( | ||||
|         pretrained_model_name_or_path | ||||
|     ): | ||||
|  | ||||
| @ -2,9 +2,9 @@ | ||||
| # SPDX-FileCopyrightText: Copyright contributors to the vLLM project | ||||
| import gc | ||||
|  | ||||
| from benchmark_utils import TimeCollector | ||||
| from tabulate import tabulate | ||||
|  | ||||
| from benchmark_utils import TimeCollector | ||||
| from vllm.utils import FlexibleArgumentParser | ||||
| from vllm.v1.core.block_pool import BlockPool | ||||
|  | ||||
|  | ||||
| @ -5,9 +5,9 @@ import time | ||||
| from unittest import mock | ||||
|  | ||||
| import numpy as np | ||||
| from benchmark_utils import TimeCollector | ||||
| from tabulate import tabulate | ||||
|  | ||||
| from benchmark_utils import TimeCollector | ||||
| from vllm.config import ( | ||||
|     CacheConfig, | ||||
|     DeviceConfig, | ||||
| @ -164,7 +164,7 @@ def invoke_main() -> None: | ||||
|     ) | ||||
|     parser.add_argument( | ||||
|         "--batched", action="store_true", help="consider time to prepare batch" | ||||
|     )  # noqa: E501 | ||||
|     ) | ||||
|     parser.add_argument( | ||||
|         "--num-iteration", | ||||
|         type=int, | ||||
|  | ||||
| @ -32,7 +32,6 @@ import dataclasses | ||||
| import json | ||||
| import random | ||||
| import time | ||||
| from typing import Optional | ||||
|  | ||||
| from transformers import PreTrainedTokenizerBase | ||||
|  | ||||
| @ -80,7 +79,7 @@ def sample_requests_from_dataset( | ||||
|     num_requests: int, | ||||
|     tokenizer: PreTrainedTokenizerBase, | ||||
|     input_length_range: tuple[int, int], | ||||
|     fixed_output_len: Optional[int], | ||||
|     fixed_output_len: int | None, | ||||
| ) -> list[Request]: | ||||
|     if fixed_output_len is not None and fixed_output_len < 4: | ||||
|         raise ValueError("output_len too small") | ||||
| @ -128,7 +127,7 @@ def sample_requests_from_random( | ||||
|     num_requests: int, | ||||
|     tokenizer: PreTrainedTokenizerBase, | ||||
|     input_length_range: tuple[int, int], | ||||
|     fixed_output_len: Optional[int], | ||||
|     fixed_output_len: int | None, | ||||
|     prefix_len: int, | ||||
| ) -> list[Request]: | ||||
|     requests = [] | ||||
|  | ||||
| @ -7,7 +7,6 @@ import dataclasses | ||||
| import json | ||||
| import random | ||||
| import time | ||||
| from typing import Optional | ||||
|  | ||||
| from transformers import AutoTokenizer, PreTrainedTokenizerBase | ||||
|  | ||||
| @ -24,7 +23,7 @@ def sample_requests( | ||||
|     dataset_path: str, | ||||
|     num_requests: int, | ||||
|     tokenizer: PreTrainedTokenizerBase, | ||||
|     fixed_output_len: Optional[int], | ||||
|     fixed_output_len: int | None, | ||||
| ) -> list[tuple[str, int, int, int]]: | ||||
|     if fixed_output_len is not None and fixed_output_len < 4: | ||||
|         raise ValueError("output_len too small") | ||||
|  | ||||
| @ -31,20 +31,19 @@ import time | ||||
| import uuid | ||||
| import warnings | ||||
| from collections.abc import AsyncGenerator | ||||
| from contextlib import nullcontext | ||||
| from dataclasses import dataclass | ||||
| from typing import Optional | ||||
|  | ||||
| import datasets | ||||
| import numpy as np | ||||
| import pandas as pd | ||||
| from tqdm.asyncio import tqdm | ||||
| from transformers import PreTrainedTokenizerBase | ||||
|  | ||||
| from backend_request_func import ( | ||||
|     ASYNC_REQUEST_FUNCS, | ||||
|     RequestFuncInput, | ||||
|     RequestFuncOutput, | ||||
| ) | ||||
| from tqdm.asyncio import tqdm | ||||
| from transformers import PreTrainedTokenizerBase | ||||
|  | ||||
| try: | ||||
|     from vllm.transformers_utils.tokenizer import get_tokenizer | ||||
| @ -317,7 +316,7 @@ def calculate_metrics( | ||||
|     tokenizer: PreTrainedTokenizerBase, | ||||
|     selected_percentile_metrics: list[str], | ||||
|     selected_percentiles: list[float], | ||||
|     goodput_config_dict: Optional[dict[str, float]] = None, | ||||
|     goodput_config_dict: dict[str, float] | None = None, | ||||
| ) -> tuple[BenchmarkMetrics, list[int]]: | ||||
|     actual_output_lens: list[int] = [] | ||||
|     total_input = 0 | ||||
| @ -437,9 +436,9 @@ async def benchmark( | ||||
|     selected_percentile_metrics: list[str], | ||||
|     selected_percentiles: list[str], | ||||
|     ignore_eos: bool, | ||||
|     max_concurrency: Optional[int], | ||||
|     max_concurrency: int | None, | ||||
|     structured_output_ratio: float, | ||||
|     goodput_config_dict: Optional[dict[str, float]] = None, | ||||
|     goodput_config_dict: dict[str, float] | None = None, | ||||
| ): | ||||
|     if backend in ASYNC_REQUEST_FUNCS: | ||||
|         request_func = ASYNC_REQUEST_FUNCS[backend] | ||||
| @ -503,15 +502,9 @@ async def benchmark( | ||||
|  | ||||
|     pbar = None if disable_tqdm else tqdm(total=len(input_requests)) | ||||
|  | ||||
|     # This can be used once the minimum Python version is 3.10 or higher, | ||||
|     # and it will simplify the code in limited_request_func. | ||||
|     #    semaphore = (asyncio.Semaphore(max_concurrency) | ||||
|     #                 if max_concurrency else contextlib.nullcontext()) | ||||
|     semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None | ||||
|     semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else nullcontext() | ||||
|  | ||||
|     async def limited_request_func(request_func_input, pbar): | ||||
|         if semaphore is None: | ||||
|             return await request_func(request_func_input=request_func_input, pbar=pbar) | ||||
|         async with semaphore: | ||||
|             return await request_func(request_func_input=request_func_input, pbar=pbar) | ||||
|  | ||||
| @ -910,13 +903,13 @@ def create_argument_parser(): | ||||
|     parser.add_argument( | ||||
|         "--tokenizer", | ||||
|         type=str, | ||||
|         help="Name or path of the tokenizer, if not using the default tokenizer.",  # noqa: E501 | ||||
|         help="Name or path of the tokenizer, if not using the default tokenizer.", | ||||
|     ) | ||||
|     parser.add_argument( | ||||
|         "--tokenizer-mode", | ||||
|         type=str, | ||||
|         default="auto", | ||||
|         help="Name or path of the tokenizer, if not using the default tokenizer.",  # noqa: E501 | ||||
|         help="Name or path of the tokenizer, if not using the default tokenizer.", | ||||
|     ) | ||||
|     parser.add_argument( | ||||
|         "--num-prompts", | ||||
|  | ||||
| @ -6,7 +6,7 @@ import math | ||||
| import os | ||||
| import time | ||||
| from types import TracebackType | ||||
| from typing import Any, Optional, Union | ||||
| from typing import Any | ||||
|  | ||||
|  | ||||
| def convert_to_pytorch_benchmark_format( | ||||
| @ -92,7 +92,7 @@ class TimeCollector: | ||||
|     def __init__(self, scale: int) -> None: | ||||
|         self.cnt: int = 0 | ||||
|         self._sum: int = 0 | ||||
|         self._max: Optional[int] = None | ||||
|         self._max: int | None = None | ||||
|         self.scale = scale | ||||
|         self.start_time: int = time.monotonic_ns() | ||||
|  | ||||
| @ -104,13 +104,13 @@ class TimeCollector: | ||||
|         else: | ||||
|             self._max = max(self._max, v) | ||||
|  | ||||
|     def avg(self) -> Union[float, str]: | ||||
|     def avg(self) -> float | str: | ||||
|         return self._sum * 1.0 / self.cnt / self.scale if self.cnt > 0 else "N/A" | ||||
|  | ||||
|     def max(self) -> Union[float, str]: | ||||
|     def max(self) -> float | str: | ||||
|         return self._max / self.scale if self._max else "N/A" | ||||
|  | ||||
|     def dump_avg_max(self) -> list[Union[float, str]]: | ||||
|     def dump_avg_max(self) -> list[float | str]: | ||||
|         return [self.avg(), self.max()] | ||||
|  | ||||
|     def __enter__(self) -> None: | ||||
| @ -118,8 +118,8 @@ class TimeCollector: | ||||
|  | ||||
|     def __exit__( | ||||
|         self, | ||||
|         exc_type: Optional[type[BaseException]], | ||||
|         exc_value: Optional[BaseException], | ||||
|         exc_traceback: Optional[TracebackType], | ||||
|         exc_type: type[BaseException] | None, | ||||
|         exc_value: BaseException | None, | ||||
|         exc_traceback: TracebackType | None, | ||||
|     ) -> None: | ||||
|         self.collect(time.monotonic_ns() - self.start_time) | ||||
|  | ||||
| @ -6,8 +6,7 @@ import copy | ||||
| import itertools | ||||
| import pickle as pkl | ||||
| import time | ||||
| from collections.abc import Iterable | ||||
| from typing import Callable | ||||
| from collections.abc import Callable, Iterable | ||||
|  | ||||
| import torch | ||||
| import torch.utils.benchmark as TBenchmark | ||||
|  | ||||
| @ -6,8 +6,7 @@ import copy | ||||
| import itertools | ||||
| import pickle as pkl | ||||
| import time | ||||
| from collections.abc import Iterable | ||||
| from typing import Callable, Optional | ||||
| from collections.abc import Callable, Iterable | ||||
|  | ||||
| import torch | ||||
| import torch.utils.benchmark as TBenchmark | ||||
| @ -17,7 +16,7 @@ from weight_shapes import WEIGHT_SHAPES | ||||
|  | ||||
| from vllm import _custom_ops as ops | ||||
| from vllm.model_executor.layers.quantization.utils.fp8_utils import ( | ||||
|     w8a8_block_fp8_matmul, | ||||
|     w8a8_triton_block_scaled_mm, | ||||
| ) | ||||
| from vllm.utils import FlexibleArgumentParser, cdiv | ||||
|  | ||||
| @ -53,7 +52,7 @@ def bench_int8( | ||||
|     n: int, | ||||
|     label: str, | ||||
|     sub_label: str, | ||||
|     bench_kernels: Optional[list[str]] = None, | ||||
|     bench_kernels: list[str] | None = None, | ||||
| ) -> Iterable[TMeasurement]: | ||||
|     """Benchmark INT8-based kernels.""" | ||||
|     assert dtype == torch.int8 | ||||
| @ -108,7 +107,7 @@ def bench_fp8( | ||||
|     n: int, | ||||
|     label: str, | ||||
|     sub_label: str, | ||||
|     bench_kernels: Optional[list[str]] = None, | ||||
|     bench_kernels: list[str] | None = None, | ||||
| ) -> Iterable[TMeasurement]: | ||||
|     """Benchmark FP8-based kernels.""" | ||||
|     assert dtype == torch.float8_e4m3fn | ||||
| @ -158,7 +157,7 @@ def bench_fp8( | ||||
|         "cutlass_fp8_fp8_fp16_scaled_mm_bias": lambda: ops.cutlass_scaled_mm( | ||||
|             a, b, scale_a, scale_b, torch.float16, bias.to(dtype=torch.float16) | ||||
|         ), | ||||
|         "triton_fp8_fp8_fp16_scaled_mm_blockwise": lambda: w8a8_block_fp8_matmul( | ||||
|         "triton_fp8_fp8_fp16_scaled_mm_blockwise": lambda: w8a8_triton_block_scaled_mm( | ||||
|             a_cont, b.t(), block_scale_a, block_scale_b.t(), (128, 128) | ||||
|         ), | ||||
|         "cutlass_fp8_fp8_fp16_scaled_mm_blockwise": lambda: ops.cutlass_scaled_mm( | ||||
| @ -183,7 +182,7 @@ def bench( | ||||
|     n: int, | ||||
|     label: str, | ||||
|     sub_label: str, | ||||
|     bench_kernels: Optional[list[str]] = None, | ||||
|     bench_kernels: list[str] | None = None, | ||||
| ) -> Iterable[TMeasurement]: | ||||
|     if dtype == torch.int8: | ||||
|         return bench_int8(dtype, m, k, n, label, sub_label, bench_kernels) | ||||
| @ -201,7 +200,7 @@ def print_timers(timers: Iterable[TMeasurement]): | ||||
| def run( | ||||
|     dtype: torch.dtype, | ||||
|     MKNs: Iterable[tuple[int, int, int]], | ||||
|     bench_kernels: Optional[list[str]] = None, | ||||
|     bench_kernels: list[str] | None = None, | ||||
| ) -> Iterable[TMeasurement]: | ||||
|     results = [] | ||||
|     for m, k, n in MKNs: | ||||
|  | ||||
| @ -55,9 +55,7 @@ benchmark() { | ||||
|   output_len=$2 | ||||
|  | ||||
|  | ||||
|   CUDA_VISIBLE_DEVICES=0 python3 \ | ||||
|     -m vllm.entrypoints.openai.api_server \ | ||||
|     --model $model \ | ||||
|   CUDA_VISIBLE_DEVICES=0 vllm serve $model \ | ||||
|     --port 8100 \ | ||||
|     --max-model-len 10000 \ | ||||
|     --gpu-memory-utilization 0.6 \ | ||||
| @ -65,9 +63,7 @@ benchmark() { | ||||
|     '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' & | ||||
|  | ||||
|  | ||||
|   CUDA_VISIBLE_DEVICES=1 python3 \ | ||||
|     -m vllm.entrypoints.openai.api_server \ | ||||
|     --model $model \ | ||||
|   CUDA_VISIBLE_DEVICES=1 vllm serve $model \ | ||||
|     --port 8200 \ | ||||
|     --max-model-len 10000 \ | ||||
|     --gpu-memory-utilization 0.6 \ | ||||
|  | ||||
| @ -38,16 +38,12 @@ wait_for_server() { | ||||
| launch_chunked_prefill() { | ||||
|   model="meta-llama/Meta-Llama-3.1-8B-Instruct" | ||||
|   # disagg prefill | ||||
|   CUDA_VISIBLE_DEVICES=0 python3 \ | ||||
|     -m vllm.entrypoints.openai.api_server \ | ||||
|     --model $model \ | ||||
|   CUDA_VISIBLE_DEVICES=0 vllm serve $model \ | ||||
|     --port 8100 \ | ||||
|     --max-model-len 10000 \ | ||||
|     --enable-chunked-prefill \ | ||||
|     --gpu-memory-utilization 0.6 & | ||||
|   CUDA_VISIBLE_DEVICES=1 python3 \ | ||||
|     -m vllm.entrypoints.openai.api_server \ | ||||
|     --model $model \ | ||||
|   CUDA_VISIBLE_DEVICES=1 vllm serve $model \ | ||||
|     --port 8200 \ | ||||
|     --max-model-len 10000 \ | ||||
|     --enable-chunked-prefill \ | ||||
| @ -62,18 +58,14 @@ launch_chunked_prefill() { | ||||
| launch_disagg_prefill() { | ||||
|   model="meta-llama/Meta-Llama-3.1-8B-Instruct" | ||||
|   # disagg prefill | ||||
|   CUDA_VISIBLE_DEVICES=0 python3 \ | ||||
|     -m vllm.entrypoints.openai.api_server \ | ||||
|     --model $model \ | ||||
|   CUDA_VISIBLE_DEVICES=0 vllm serve $model \ | ||||
|     --port 8100 \ | ||||
|     --max-model-len 10000 \ | ||||
|     --gpu-memory-utilization 0.6 \ | ||||
|     --kv-transfer-config \ | ||||
|     '{"kv_connector":"P2pNcclConnector","kv_role":"kv_producer","kv_rank":0,"kv_parallel_size":2,"kv_buffer_size":5e9}' & | ||||
|  | ||||
|   CUDA_VISIBLE_DEVICES=1 python3 \ | ||||
|     -m vllm.entrypoints.openai.api_server \ | ||||
|     --model $model \ | ||||
|   CUDA_VISIBLE_DEVICES=1 vllm serve $model \ | ||||
|     --port 8200 \ | ||||
|     --max-model-len 10000 \ | ||||
|     --gpu-memory-utilization 0.6 \ | ||||
|  | ||||
| @ -3,10 +3,9 @@ | ||||
|  | ||||
| import pickle as pkl | ||||
| import time | ||||
| from collections.abc import Iterable | ||||
| from collections.abc import Callable, Iterable | ||||
| from dataclasses import dataclass | ||||
| from itertools import product | ||||
| from typing import Callable, Optional | ||||
|  | ||||
| import torch | ||||
| import torch.utils.benchmark as TBenchmark | ||||
| @ -51,7 +50,7 @@ def get_bench_params() -> list[bench_params_t]: | ||||
| def unfused_int8_impl( | ||||
|     rms_norm_layer: RMSNorm, | ||||
|     x: torch.Tensor, | ||||
|     residual: Optional[torch.Tensor], | ||||
|     residual: torch.Tensor | None, | ||||
|     quant_dtype: torch.dtype, | ||||
| ): | ||||
|     # Norm | ||||
| @ -68,7 +67,7 @@ def unfused_int8_impl( | ||||
| def unfused_fp8_impl( | ||||
|     rms_norm_layer: RMSNorm, | ||||
|     x: torch.Tensor, | ||||
|     residual: Optional[torch.Tensor], | ||||
|     residual: torch.Tensor | None, | ||||
|     quant_dtype: torch.dtype, | ||||
| ): | ||||
|     # Norm | ||||
| @ -85,7 +84,7 @@ def unfused_fp8_impl( | ||||
| def fused_impl( | ||||
|     rms_norm_layer: RMSNorm,  # this stores the weights | ||||
|     x: torch.Tensor, | ||||
|     residual: Optional[torch.Tensor], | ||||
|     residual: torch.Tensor | None, | ||||
|     quant_dtype: torch.dtype, | ||||
| ): | ||||
|     out, _ = ops.rms_norm_dynamic_per_token_quant( | ||||
|  | ||||
							
								
								
									
										191
									
								
								benchmarks/kernels/bench_mxfp4_qutlass.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										191
									
								
								benchmarks/kernels/bench_mxfp4_qutlass.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,191 @@ | ||||
| # SPDX-License-Identifier: Apache-2.0 | ||||
| # SPDX-FileCopyrightText: Copyright contributors to the vLLM project | ||||
| # | ||||
| # Copyright (C) 2025 Roberto L. Castro (Roberto.LopezCastro@ist.ac.at). | ||||
| # All Rights Reserved. | ||||
| # | ||||
| # Licensed under the Apache License, Version 2.0 (the "License"); | ||||
| # you may not use this file except in compliance with the License. | ||||
| # You may obtain a copy of the License at | ||||
| # | ||||
| #       http://www.apache.org/licenses/LICENSE-2.0 | ||||
| # | ||||
| # Unless required by applicable law or agreed to in writing, software | ||||
| # distributed under the License is distributed on an "AS IS" BASIS, | ||||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||
| # See the License for the specific language governing permissions and | ||||
| # limitations under the License. | ||||
| # | ||||
|  | ||||
| import argparse | ||||
| import copy | ||||
| import itertools | ||||
|  | ||||
| import torch | ||||
| from compressed_tensors.transform.utils.hadamard import deterministic_hadamard_matrix | ||||
| from weight_shapes import WEIGHT_SHAPES | ||||
|  | ||||
| from vllm._custom_ops import fusedQuantizeMx, matmul_mxf4_bf16_tn | ||||
| from vllm.model_executor.layers.quantization.qutlass_utils import to_blocked | ||||
| from vllm.triton_utils import triton | ||||
|  | ||||
| PROVIDER_CFGS = { | ||||
|     "torch-bf16": dict(enabled=True), | ||||
|     "mxfp4": dict(no_a_quant=False, enabled=True), | ||||
|     "mxfp4-noquant": dict(no_a_quant=True, enabled=True), | ||||
| } | ||||
|  | ||||
| _enabled = [k for k, v in PROVIDER_CFGS.items() if v["enabled"]] | ||||
|  | ||||
|  | ||||
| def get_hadamard_matrix(group_size: int, dtype: torch.dtype, device: torch.device): | ||||
|     return ( | ||||
|         deterministic_hadamard_matrix(group_size, dtype=dtype, device=device) | ||||
|         * group_size**-0.5 | ||||
|     ) | ||||
|  | ||||
|  | ||||
| def _quant_weight_mxfp4( | ||||
|     b: torch.Tensor, forward_hadamard_matrix: torch.Tensor, device: str | ||||
| ): | ||||
|     weight_hf_e2m1, weight_hf_e8m0 = fusedQuantizeMx( | ||||
|         b, forward_hadamard_matrix, method="abs_max" | ||||
|     ) | ||||
|     weight_hf_scale_block = to_blocked(weight_hf_e8m0, backend="triton") | ||||
|     return weight_hf_e2m1, weight_hf_scale_block | ||||
|  | ||||
|  | ||||
| def build_mxfp4_runner(cfg, a, b, forward_hadamard_matrix, dtype, device): | ||||
|     weight_hf_e2m1, weight_hf_scale_block = _quant_weight_mxfp4( | ||||
|         b, forward_hadamard_matrix, device | ||||
|     ) | ||||
|     alpha = torch.tensor([1.0], device="cuda") | ||||
|  | ||||
|     if cfg["no_a_quant"]: | ||||
|         # Pre-quantize activation | ||||
|         input_hf_e2m1, input_hf_e8m0 = fusedQuantizeMx( | ||||
|             a, forward_hadamard_matrix, method="abs_max" | ||||
|         ) | ||||
|         input_hf_scale_block = to_blocked(input_hf_e8m0, backend="triton") | ||||
|  | ||||
|         def run(): | ||||
|             return matmul_mxf4_bf16_tn( | ||||
|                 input_hf_e2m1, | ||||
|                 weight_hf_e2m1, | ||||
|                 input_hf_scale_block, | ||||
|                 weight_hf_scale_block, | ||||
|                 alpha, | ||||
|             ) | ||||
|  | ||||
|         return run | ||||
|  | ||||
|     # Quantize activation on-the-fly | ||||
|     def run(): | ||||
|         input_hf_e2m1, input_hf_e8m0 = fusedQuantizeMx( | ||||
|             a, forward_hadamard_matrix, method="abs_max" | ||||
|         ) | ||||
|         input_hf_scale_block = to_blocked(input_hf_e8m0, backend="triton") | ||||
|         return matmul_mxf4_bf16_tn( | ||||
|             input_hf_e2m1, | ||||
|             weight_hf_e2m1, | ||||
|             input_hf_scale_block, | ||||
|             weight_hf_scale_block, | ||||
|             alpha, | ||||
|         ) | ||||
|  | ||||
|     return run | ||||
|  | ||||
|  | ||||
| @triton.testing.perf_report( | ||||
|     triton.testing.Benchmark( | ||||
|         x_names=["batch_size"], | ||||
|         x_vals=[ | ||||
|             1, | ||||
|             4, | ||||
|             8, | ||||
|             16, | ||||
|             32, | ||||
|             64, | ||||
|             128, | ||||
|             256, | ||||
|             512, | ||||
|             1024, | ||||
|             2048, | ||||
|             4096, | ||||
|             8192, | ||||
|             16384, | ||||
|             24576, | ||||
|             32768, | ||||
|         ], | ||||
|         x_log=False, | ||||
|         line_arg="provider", | ||||
|         line_vals=_enabled, | ||||
|         line_names=_enabled, | ||||
|         ylabel="TFLOP/s (larger is better)", | ||||
|         plot_name="BF16 vs MXFP4 GEMMs", | ||||
|         args={}, | ||||
|     ) | ||||
| ) | ||||
| def benchmark(batch_size, provider, N, K, had_size): | ||||
|     M = batch_size | ||||
|     device = "cuda" | ||||
|     dtype = torch.bfloat16 | ||||
|  | ||||
|     a = torch.randn((M, K), device=device, dtype=dtype) | ||||
|     b = torch.randn((N, K), device=device, dtype=dtype) | ||||
|     forward_hadamard_matrix = get_hadamard_matrix(had_size, dtype, device) | ||||
|  | ||||
|     quantiles = [0.5, 0.2, 0.8] | ||||
|  | ||||
|     if provider == "torch-bf16": | ||||
|         ms, min_ms, max_ms = triton.testing.do_bench_cudagraph( | ||||
|             lambda: torch.nn.functional.linear(a, b), rep=200, quantiles=quantiles | ||||
|         ) | ||||
|     else: | ||||
|         cfg = PROVIDER_CFGS[provider] | ||||
|         run_quant = build_mxfp4_runner( | ||||
|             cfg, a, b, forward_hadamard_matrix, dtype, device | ||||
|         ) | ||||
|         ms, min_ms, max_ms = triton.testing.do_bench_cudagraph( | ||||
|             lambda: run_quant(), rep=200, quantiles=quantiles | ||||
|         ) | ||||
|  | ||||
|     to_tflops = lambda t_ms: (2 * M * N * K) * 1e-12 / (t_ms * 1e-3) | ||||
|     return to_tflops(ms), to_tflops(max_ms), to_tflops(min_ms) | ||||
|  | ||||
|  | ||||
| def prepare_shapes(args): | ||||
|     out = [] | ||||
|     for model, tp_size in itertools.product(args.models, args.tp_sizes): | ||||
|         for KN, tp_dim in copy.deepcopy(WEIGHT_SHAPES[model]): | ||||
|             KN[tp_dim] //= tp_size | ||||
|             KN.append(model) | ||||
|             out.append(KN) | ||||
|     return out | ||||
|  | ||||
|  | ||||
| if __name__ == "__main__": | ||||
|     parser = argparse.ArgumentParser() | ||||
|     parser.add_argument( | ||||
|         "--models", | ||||
|         nargs="+", | ||||
|         type=str, | ||||
|         default=["meta-llama/Llama-3.3-70B-Instruct"], | ||||
|         choices=list(WEIGHT_SHAPES.keys()), | ||||
|     ) | ||||
|     parser.add_argument("--tp-sizes", nargs="+", type=int, default=[1]) | ||||
|     args = parser.parse_args() | ||||
|  | ||||
|     for K, N, model in prepare_shapes(args): | ||||
|         for had_size in [32, 64, 128]: | ||||
|             print(f"{model}, N={N} K={K}, HAD={had_size}, BF16 vs MXFP4 GEMMs TFLOP/s:") | ||||
|             benchmark.run( | ||||
|                 print_data=True, | ||||
|                 show_plots=True, | ||||
|                 save_path=f"bench_mxfp4_res_n{N}_k{K}", | ||||
|                 N=N, | ||||
|                 K=K, | ||||
|                 had_size=had_size, | ||||
|             ) | ||||
|  | ||||
|     print("Benchmark finished!") | ||||
							
								
								
									
										207
									
								
								benchmarks/kernels/bench_nvfp4_qutlass.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										207
									
								
								benchmarks/kernels/bench_nvfp4_qutlass.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,207 @@ | ||||
| # SPDX-License-Identifier: Apache-2.0 | ||||
| # SPDX-FileCopyrightText: Copyright contributors to the vLLM project | ||||
| # | ||||
| # Copyright (C) 2025 Roberto L. Castro (Roberto.LopezCastro@ist.ac.at). | ||||
| # All Rights Reserved. | ||||
| # | ||||
| # Licensed under the Apache License, Version 2.0 (the "License"); | ||||
| # you may not use this file except in compliance with the License. | ||||
| # You may obtain a copy of the License at | ||||
| # | ||||
| #       http://www.apache.org/licenses/LICENSE-2.0 | ||||
| # | ||||
| # Unless required by applicable law or agreed to in writing, software | ||||
| # distributed under the License is distributed on an "AS IS" BASIS, | ||||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||||
| # See the License for the specific language governing permissions and | ||||
| # limitations under the License. | ||||
| # | ||||
|  | ||||
| import argparse | ||||
| import copy | ||||
| import itertools | ||||
|  | ||||
| import torch | ||||
| from compressed_tensors.transform.utils.hadamard import deterministic_hadamard_matrix | ||||
| from weight_shapes import WEIGHT_SHAPES | ||||
|  | ||||
| from vllm import _custom_ops as ops  # use existing nvfp4 gemm in vllm | ||||
| from vllm._custom_ops import fusedQuantizeNv | ||||
| from vllm.model_executor.layers.quantization.qutlass_utils import to_blocked | ||||
| from vllm.triton_utils import triton | ||||
|  | ||||
| PROVIDER_CFGS = { | ||||
|     "torch-bf16": dict(enabled=True), | ||||
|     "nvfp4": dict(no_a_quant=False, enabled=True), | ||||
|     "nvfp4-noquant": dict(no_a_quant=True, enabled=True), | ||||
| } | ||||
|  | ||||
| _enabled = [k for k, v in PROVIDER_CFGS.items() if v["enabled"]] | ||||
|  | ||||
|  | ||||
| def get_hadamard_matrix(group_size: int, dtype: torch.dtype, device: torch.device): | ||||
|     return ( | ||||
|         deterministic_hadamard_matrix(group_size, dtype=dtype, device=device) | ||||
|         * group_size**-0.5 | ||||
|     ) | ||||
|  | ||||
|  | ||||
| def _quant_weight_nvfp4( | ||||
|     b: torch.Tensor, | ||||
|     forward_hadamard_matrix: torch.Tensor, | ||||
|     global_scale: torch.Tensor, | ||||
|     device: str, | ||||
|     M: int, | ||||
|     N: int, | ||||
|     K: int, | ||||
| ): | ||||
|     weight_hf_e2m1, weight_hf_e8m0 = fusedQuantizeNv( | ||||
|         b, forward_hadamard_matrix, global_scale | ||||
|     ) | ||||
|     weight_hf_scale_block = to_blocked(weight_hf_e8m0, backend="triton").view( | ||||
|         -1, K // 16 | ||||
|     ) | ||||
|     return weight_hf_e2m1, weight_hf_scale_block | ||||
|  | ||||
|  | ||||
| def build_nvfp4_runner(cfg, a, b, forward_hadamard_matrix, dtype, device, M, N, K): | ||||
|     alpha = torch.tensor([1.0], device="cuda") | ||||
|     global_scale = torch.tensor([1.0], device="cuda") | ||||
|     weight_hf_e2m1, weight_hf_scale_block = _quant_weight_nvfp4( | ||||
|         b, forward_hadamard_matrix, global_scale, device, M, N, K | ||||
|     ) | ||||
|  | ||||
|     if cfg["no_a_quant"]: | ||||
|         # Pre-quantize activation | ||||
|         input_hf_e2m1, input_hf_e8m0 = fusedQuantizeNv( | ||||
|             a, forward_hadamard_matrix, global_scale | ||||
|         ) | ||||
|         input_hf_scale_block = to_blocked(input_hf_e8m0, backend="triton").view( | ||||
|             -1, K // 16 | ||||
|         ) | ||||
|  | ||||
|         def run(): | ||||
|             return ops.cutlass_scaled_fp4_mm( | ||||
|                 input_hf_e2m1, | ||||
|                 weight_hf_e2m1, | ||||
|                 input_hf_scale_block, | ||||
|                 weight_hf_scale_block, | ||||
|                 alpha, | ||||
|                 torch.bfloat16, | ||||
|             ) | ||||
|  | ||||
|         return run | ||||
|  | ||||
|     # Quantize activation on-the-fly | ||||
|     def run(): | ||||
|         input_hf_e2m1, input_hf_e8m0 = fusedQuantizeNv( | ||||
|             a, forward_hadamard_matrix, global_scale | ||||
|         ) | ||||
|         input_hf_scale_block = to_blocked(input_hf_e8m0, backend="triton").view( | ||||
|             -1, K // 16 | ||||
|         ) | ||||
|         return ops.cutlass_scaled_fp4_mm( | ||||
|             input_hf_e2m1, | ||||
|             weight_hf_e2m1, | ||||
|             input_hf_scale_block, | ||||
|             weight_hf_scale_block, | ||||
|             alpha, | ||||
|             torch.bfloat16, | ||||
|         ) | ||||
|  | ||||
|     return run | ||||
|  | ||||
|  | ||||
| @triton.testing.perf_report( | ||||
|     triton.testing.Benchmark( | ||||
|         x_names=["batch_size"], | ||||
|         x_vals=[ | ||||
|             1, | ||||
|             4, | ||||
|             8, | ||||
|             16, | ||||
|             32, | ||||
|             64, | ||||
|             128, | ||||
|             256, | ||||
|             512, | ||||
|             1024, | ||||
|             2048, | ||||
|             4096, | ||||
|             8192, | ||||
|             16384, | ||||
|             24576, | ||||
|             32768, | ||||
|         ], | ||||
|         x_log=False, | ||||
|         line_arg="provider", | ||||
|         line_vals=_enabled, | ||||
|         line_names=_enabled, | ||||
|         ylabel="TFLOP/s (larger is better)", | ||||
|         plot_name="BF16 vs NVFP4 GEMMs", | ||||
|         args={}, | ||||
|     ) | ||||
| ) | ||||
| def benchmark(batch_size, provider, N, K, had_size): | ||||
|     M = batch_size | ||||
|     device = "cuda" | ||||
|     dtype = torch.bfloat16 | ||||
|  | ||||
|     a = torch.randn((M, K), device=device, dtype=dtype) | ||||
|     b = torch.randn((N, K), device=device, dtype=dtype) | ||||
|     forward_hadamard_matrix = get_hadamard_matrix(had_size, dtype, device) | ||||
|  | ||||
|     quantiles = [0.5, 0.2, 0.8] | ||||
|  | ||||
|     if provider == "torch-bf16": | ||||
|         ms, min_ms, max_ms = triton.testing.do_bench_cudagraph( | ||||
|             lambda: torch.nn.functional.linear(a, b), rep=200, quantiles=quantiles | ||||
|         ) | ||||
|     else: | ||||
|         cfg = PROVIDER_CFGS[provider] | ||||
|         run_quant = build_nvfp4_runner( | ||||
|             cfg, a, b, forward_hadamard_matrix, dtype, device, M, N, K | ||||
|         ) | ||||
|         ms, min_ms, max_ms = triton.testing.do_bench_cudagraph( | ||||
|             lambda: run_quant(), rep=200, quantiles=quantiles | ||||
|         ) | ||||
|  | ||||
|     to_tflops = lambda t_ms: (2 * M * N * K) * 1e-12 / (t_ms * 1e-3) | ||||
|     return to_tflops(ms), to_tflops(max_ms), to_tflops(min_ms) | ||||
|  | ||||
|  | ||||
| def prepare_shapes(args): | ||||
|     out = [] | ||||
|     for model, tp_size in itertools.product(args.models, args.tp_sizes): | ||||
|         for KN, tp_dim in copy.deepcopy(WEIGHT_SHAPES[model]): | ||||
|             KN[tp_dim] //= tp_size | ||||
|             KN.append(model) | ||||
|             out.append(KN) | ||||
|     return out | ||||
|  | ||||
|  | ||||
| if __name__ == "__main__": | ||||
|     parser = argparse.ArgumentParser() | ||||
|     parser.add_argument( | ||||
|         "--models", | ||||
|         nargs="+", | ||||
|         type=str, | ||||
|         default=["meta-llama/Llama-3.3-70B-Instruct"], | ||||
|         choices=list(WEIGHT_SHAPES.keys()), | ||||
|     ) | ||||
|     parser.add_argument("--tp-sizes", nargs="+", type=int, default=[1]) | ||||
|     args = parser.parse_args() | ||||
|  | ||||
|     for K, N, model in prepare_shapes(args): | ||||
|         for had_size in [16, 32, 64, 128]: | ||||
|             print(f"{model}, N={N} K={K}, HAD={had_size}, BF16 vs NVFP4 GEMMs TFLOP/s:") | ||||
|             benchmark.run( | ||||
|                 print_data=True, | ||||
|                 show_plots=True, | ||||
|                 save_path=f"bench_nvfp4_res_n{N}_k{K}", | ||||
|                 N=N, | ||||
|                 K=K, | ||||
|                 had_size=had_size, | ||||
|             ) | ||||
|  | ||||
|     print("Benchmark finished!") | ||||
| @ -1,7 +1,7 @@ | ||||
| # SPDX-License-Identifier: Apache-2.0 | ||||
| # SPDX-FileCopyrightText: Copyright contributors to the vLLM project | ||||
| import itertools | ||||
| from typing import Callable | ||||
| from collections.abc import Callable | ||||
| from unittest.mock import patch | ||||
|  | ||||
| import pandas as pd | ||||
| @ -10,7 +10,8 @@ import torch | ||||
| from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8 | ||||
| from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape | ||||
| from vllm.triton_utils import triton | ||||
| from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser | ||||
| from vllm.utils import FlexibleArgumentParser | ||||
| from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE | ||||
|  | ||||
|  | ||||
| def with_triton_mode(fn): | ||||
|  | ||||
| @ -10,7 +10,8 @@ import vllm.model_executor.layers.activation  # noqa F401 | ||||
| from vllm.model_executor.custom_op import CustomOp | ||||
| from vllm.platforms import current_platform | ||||
| from vllm.triton_utils import triton | ||||
| from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser | ||||
| from vllm.utils import FlexibleArgumentParser | ||||
| from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE | ||||
|  | ||||
| batch_size_range = [1, 16, 32, 64, 128] | ||||
| seq_len_range = [1, 16, 64, 128, 256, 512, 1024, 2048, 4096] | ||||
|  | ||||
| @ -22,8 +22,8 @@ Example: | ||||
| import json | ||||
| import os | ||||
| import time | ||||
| from collections.abc import Callable | ||||
| from contextlib import nullcontext | ||||
| from typing import Callable, Optional | ||||
|  | ||||
| import torch | ||||
| import torch.distributed as dist | ||||
| @ -264,12 +264,12 @@ class CommunicatorBenchmark: | ||||
|     def benchmark_allreduce_single( | ||||
|         self, | ||||
|         sequence_length: int, | ||||
|         allreduce_fn: Callable[[torch.Tensor], Optional[torch.Tensor]], | ||||
|         allreduce_fn: Callable[[torch.Tensor], torch.Tensor | None], | ||||
|         should_use_fn: Callable[[torch.Tensor], bool], | ||||
|         context, | ||||
|         num_warmup: int, | ||||
|         num_trials: int, | ||||
|     ) -> Optional[float]: | ||||
|     ) -> float | None: | ||||
|         """Benchmark method with CUDA graph optimization.""" | ||||
|         try: | ||||
|             # Create test tensor (2D: sequence_length x hidden_size) | ||||
|  | ||||
| @ -7,7 +7,8 @@ import torch | ||||
|  | ||||
| from vllm.model_executor.layers.layernorm import RMSNorm | ||||
| from vllm.platforms import current_platform | ||||
| from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser | ||||
| from vllm.utils import FlexibleArgumentParser | ||||
| from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE | ||||
|  | ||||
|  | ||||
| @torch.inference_mode() | ||||
|  | ||||
| @ -6,11 +6,12 @@ import copy | ||||
| import json | ||||
| import pickle | ||||
| import time | ||||
| from collections.abc import Callable | ||||
| from dataclasses import dataclass | ||||
| from enum import Enum, auto | ||||
| from itertools import product | ||||
| from pathlib import Path | ||||
| from typing import Any, Callable, Optional | ||||
| from typing import Any | ||||
|  | ||||
| import torch | ||||
| import torch.utils.benchmark as TBenchmark | ||||
| @ -158,7 +159,7 @@ def ref_group_gemm( | ||||
|     seq_lens_cpu: torch.Tensor, | ||||
|     prompt_lora_mapping_cpu: torch.Tensor, | ||||
|     scaling: float, | ||||
|     add_inputs: Optional[bool], | ||||
|     add_inputs: bool | None, | ||||
| ): | ||||
|     """ | ||||
|     Torch group gemm reference implementation to test correctness of | ||||
| @ -316,8 +317,8 @@ class BenchmarkContext: | ||||
|     lora_rank: int | ||||
|     sort_by_lora_id: bool | ||||
|     dtype: torch.dtype | ||||
|     seq_length: Optional[int] = None | ||||
|     num_slices: Optional[int] = None  # num_slices for slice based ops | ||||
|     seq_length: int | None = None | ||||
|     num_slices: int | None = None  # num_slices for slice based ops | ||||
|  | ||||
|     def with_seq_length(self, seq_length: int) -> "BenchmarkContext": | ||||
|         ctx = copy.copy(self) | ||||
| @ -561,7 +562,7 @@ class BenchmarkTensors: | ||||
|         } | ||||
|  | ||||
|     def bench_fn_kwargs( | ||||
|         self, op_type: OpType, add_inputs: Optional[bool] = None | ||||
|         self, op_type: OpType, add_inputs: bool | None = None | ||||
|     ) -> dict[str, Any]: | ||||
|         if op_type.is_shrink_fn(): | ||||
|             assert add_inputs is None | ||||
| @ -575,7 +576,7 @@ class BenchmarkTensors: | ||||
|         raise ValueError(f"Unrecognized optype {self}") | ||||
|  | ||||
|     def test_correctness( | ||||
|         self, op_type: OpType, expand_fn_add_inputs: Optional[bool] | ||||
|         self, op_type: OpType, expand_fn_add_inputs: bool | None | ||||
|     ) -> bool: | ||||
|         """ | ||||
|         Test correctness of op_type implementation against a grouped gemm | ||||
| @ -611,8 +612,8 @@ def bench_optype( | ||||
|     ctx: BenchmarkContext, | ||||
|     arg_pool_size: int, | ||||
|     op_type: OpType, | ||||
|     cuda_graph_nops: Optional[int] = None, | ||||
|     expand_fn_add_inputs: Optional[bool] = None, | ||||
|     cuda_graph_nops: int | None = None, | ||||
|     expand_fn_add_inputs: bool | None = None, | ||||
|     test_correctness: bool = False, | ||||
| ) -> TMeasurement: | ||||
|     assert arg_pool_size >= 1 | ||||
| @ -679,7 +680,7 @@ def bench_torch_mm( | ||||
|     ctx: BenchmarkContext, | ||||
|     arg_pool_size: int, | ||||
|     op_type: OpType, | ||||
|     cuda_graph_nops: Optional[int] = None, | ||||
|     cuda_graph_nops: int | None = None, | ||||
| ) -> TMeasurement: | ||||
|     """ | ||||
|     Benchmark basic torch.mm as a roofline. | ||||
| @ -744,7 +745,7 @@ def use_cuda_graph_recommendation() -> str: | ||||
|             """ | ||||
|  | ||||
|  | ||||
| def print_timers(timers: list[TMeasurement], args: Optional[argparse.Namespace] = None): | ||||
| def print_timers(timers: list[TMeasurement], args: argparse.Namespace | None = None): | ||||
|     compare = TBenchmark.Compare(timers) | ||||
|     compare.print() | ||||
|  | ||||
|  | ||||
| @ -8,10 +8,9 @@ import math | ||||
| import os | ||||
| import pickle as pkl | ||||
| import time | ||||
| from collections.abc import Iterable | ||||
| from collections.abc import Callable, Iterable | ||||
| from dataclasses import dataclass | ||||
| from itertools import product | ||||
| from typing import Callable, Optional | ||||
|  | ||||
| import pandas as pd | ||||
| import torch | ||||
| @ -63,23 +62,23 @@ class BenchmarkTensors: | ||||
|     a: torch.Tensor | ||||
|  | ||||
|     w_q: torch.Tensor | ||||
|     group_size: Optional[int] | ||||
|     group_size: int | None | ||||
|     wtype: ScalarType | ||||
|     w_g_s: torch.Tensor | ||||
|     w_g_zp: Optional[torch.Tensor] | ||||
|     w_ch_s: Optional[torch.Tensor] | ||||
|     w_tok_s: Optional[torch.Tensor] | ||||
|     w_g_zp: torch.Tensor | None | ||||
|     w_ch_s: torch.Tensor | None | ||||
|     w_tok_s: torch.Tensor | None | ||||
|  | ||||
|  | ||||
| @dataclass | ||||
| class TypeConfig: | ||||
|     act_type: torch.dtype | ||||
|     weight_type: ScalarType | ||||
|     output_type: Optional[torch.dtype] | ||||
|     group_scale_type: Optional[torch.dtype] | ||||
|     group_zero_type: Optional[torch.dtype] | ||||
|     channel_scale_type: Optional[torch.dtype] | ||||
|     token_scale_type: Optional[torch.dtype] | ||||
|     output_type: torch.dtype | None | ||||
|     group_scale_type: torch.dtype | None | ||||
|     group_zero_type: torch.dtype | None | ||||
|     channel_scale_type: torch.dtype | None | ||||
|     token_scale_type: torch.dtype | None | ||||
|  | ||||
|  | ||||
| def rand_data(shape, dtype=torch.float16, scale=1): | ||||
| @ -93,8 +92,8 @@ def quantize_and_pack( | ||||
|     atype: torch.dtype, | ||||
|     w: torch.Tensor, | ||||
|     wtype: ScalarType, | ||||
|     stype: Optional[torch.dtype], | ||||
|     group_size: Optional[int], | ||||
|     stype: torch.dtype | None, | ||||
|     group_size: int | None, | ||||
|     zero_points: bool = False, | ||||
| ): | ||||
|     assert wtype.is_integer(), "TODO: support floating point weights" | ||||
| @ -113,7 +112,7 @@ def quantize_and_pack( | ||||
|  | ||||
|  | ||||
| def create_bench_tensors( | ||||
|     shape: tuple[int, int, int], types: TypeConfig, group_size: Optional[int] | ||||
|     shape: tuple[int, int, int], types: TypeConfig, group_size: int | None | ||||
| ) -> list[BenchmarkTensors]: | ||||
|     m, n, k = shape | ||||
|  | ||||
| @ -331,8 +330,8 @@ def bench_fns(label: str, sub_label: str, description: str, fns: list[Callable]) | ||||
|     return res | ||||
|  | ||||
|  | ||||
| _SWEEP_SCHEDULES_RESULTS: Optional[pd.DataFrame] = None | ||||
| _SWEEP_SCHEDULES_RESULTS_CSV: Optional[str] = None | ||||
| _SWEEP_SCHEDULES_RESULTS: pd.DataFrame | None = None | ||||
| _SWEEP_SCHEDULES_RESULTS_CSV: str | None = None | ||||
|  | ||||
|  | ||||
| def bench( | ||||
|  | ||||
| @ -579,18 +579,22 @@ def main(args: argparse.Namespace): | ||||
|         E = config.ffn_config.moe_num_experts | ||||
|         topk = config.ffn_config.moe_top_k | ||||
|         intermediate_size = config.ffn_config.ffn_hidden_size | ||||
|         hidden_size = config.hidden_size | ||||
|     elif config.architectures[0] == "JambaForCausalLM": | ||||
|         E = config.num_experts | ||||
|         topk = config.num_experts_per_tok | ||||
|         intermediate_size = config.intermediate_size | ||||
|         hidden_size = config.hidden_size | ||||
|     elif config.architectures[0] in ( | ||||
|         "DeepseekV3ForCausalLM", | ||||
|         "DeepseekV2ForCausalLM", | ||||
|         "DeepseekV3ForCausalLM", | ||||
|         "DeepseekV32ForCausalLM", | ||||
|         "Glm4MoeForCausalLM", | ||||
|     ): | ||||
|         E = config.n_routed_experts | ||||
|         topk = config.num_experts_per_tok | ||||
|         intermediate_size = config.moe_intermediate_size | ||||
|         hidden_size = config.hidden_size | ||||
|     elif config.architectures[0] in ( | ||||
|         "Qwen2MoeForCausalLM", | ||||
|         "Qwen3MoeForCausalLM", | ||||
| @ -599,10 +603,18 @@ def main(args: argparse.Namespace): | ||||
|         E = config.num_experts | ||||
|         topk = config.num_experts_per_tok | ||||
|         intermediate_size = config.moe_intermediate_size | ||||
|         hidden_size = config.hidden_size | ||||
|     elif config.architectures[0] == "Qwen3VLMoeForConditionalGeneration": | ||||
|         text_config = config.get_text_config() | ||||
|         E = text_config.num_experts | ||||
|         topk = text_config.num_experts_per_tok | ||||
|         intermediate_size = text_config.moe_intermediate_size | ||||
|         hidden_size = text_config.hidden_size | ||||
|     elif config.architectures[0] in ("HunYuanMoEV1ForCausalLM"): | ||||
|         E = config.num_experts | ||||
|         topk = config.moe_topk[0] | ||||
|         intermediate_size = config.moe_intermediate_size[0] | ||||
|         hidden_size = config.hidden_size | ||||
|     else: | ||||
|         # Support for llama4 | ||||
|         config = config.get_text_config() | ||||
| @ -610,6 +622,7 @@ def main(args: argparse.Namespace): | ||||
|         E = config.num_local_experts | ||||
|         topk = config.num_experts_per_tok | ||||
|         intermediate_size = config.intermediate_size | ||||
|         hidden_size = config.hidden_size | ||||
|     enable_ep = bool(args.enable_expert_parallel) | ||||
|     if enable_ep: | ||||
|         ensure_divisibility(E, args.tp_size, "Number of experts") | ||||
| @ -618,8 +631,7 @@ def main(args: argparse.Namespace): | ||||
|     else: | ||||
|         ensure_divisibility(intermediate_size, args.tp_size, "intermediate_size") | ||||
|         shard_intermediate_size = 2 * intermediate_size // args.tp_size | ||||
|     hidden_size = config.hidden_size | ||||
|     dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype | ||||
|     dtype = torch.float16 if current_platform.is_rocm() else config.dtype | ||||
|     use_fp8_w8a8 = args.dtype == "fp8_w8a8" | ||||
|     use_int8_w8a16 = args.dtype == "int8_w8a16" | ||||
|     block_quant_shape = get_weight_block_size_safety(config) | ||||
|  | ||||
| @ -344,7 +344,7 @@ def main(args: argparse.Namespace): | ||||
|         topk = config.num_experts_per_tok | ||||
|  | ||||
|     hidden_size = config.hidden_size | ||||
|     dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype | ||||
|     dtype = torch.float16 if current_platform.is_rocm() else config.dtype | ||||
|     use_fp8_w8a8 = args.dtype == "fp8_w8a8" | ||||
|     use_int8_w8a16 = args.dtype == "int8_w8a16" | ||||
|     use_customized_permute = args.use_customized_permute | ||||
|  | ||||
| @ -3,16 +3,15 @@ | ||||
|  | ||||
| import random | ||||
| import time | ||||
| from typing import Optional | ||||
|  | ||||
| import torch | ||||
|  | ||||
| from vllm import _custom_ops as ops | ||||
| from vllm.logger import init_logger | ||||
| from vllm.platforms import current_platform | ||||
| from vllm.utils import ( | ||||
| from vllm.utils import FlexibleArgumentParser | ||||
| from vllm.utils.torch_utils import ( | ||||
|     STR_DTYPE_TO_TORCH_DTYPE, | ||||
|     FlexibleArgumentParser, | ||||
|     create_kv_caches_with_random, | ||||
| ) | ||||
|  | ||||
| @ -37,7 +36,7 @@ def main( | ||||
|     seed: int, | ||||
|     do_profile: bool, | ||||
|     device: str = "cuda", | ||||
|     kv_cache_dtype: Optional[str] = None, | ||||
|     kv_cache_dtype: str | None = None, | ||||
| ) -> None: | ||||
|     current_platform.seed_everything(seed) | ||||
|  | ||||
|  | ||||
| @ -3,8 +3,8 @@ | ||||
|  | ||||
| import argparse | ||||
| import math | ||||
| from collections.abc import Callable | ||||
| from contextlib import contextmanager | ||||
| from typing import Callable | ||||
| from unittest.mock import patch | ||||
|  | ||||
| import torch | ||||
|  | ||||
| @ -1,155 +0,0 @@ | ||||
| # SPDX-License-Identifier: Apache-2.0 | ||||
| # SPDX-FileCopyrightText: Copyright contributors to the vLLM project | ||||
|  | ||||
| import itertools | ||||
|  | ||||
| import torch | ||||
|  | ||||
| from vllm import _custom_ops as vllm_ops | ||||
| from vllm.triton_utils import triton | ||||
|  | ||||
|  | ||||
| def polynorm_naive( | ||||
|     x: torch.Tensor, | ||||
|     weight: torch.Tensor, | ||||
|     bias: torch.Tensor, | ||||
|     eps: float = 1e-6, | ||||
| ): | ||||
|     orig_shape = x.shape | ||||
|     x = x.view(-1, x.shape[-1]) | ||||
|  | ||||
|     def norm(x, eps: float): | ||||
|         return x / torch.sqrt(x.pow(2).mean(-1, keepdim=True) + eps) | ||||
|  | ||||
|     x = x.float() | ||||
|     return ( | ||||
|         ( | ||||
|             weight[0] * norm(x**3, eps) | ||||
|             + weight[1] * norm(x**2, eps) | ||||
|             + weight[2] * norm(x, eps) | ||||
|             + bias | ||||
|         ) | ||||
|         .to(weight.dtype) | ||||
|         .view(orig_shape) | ||||
|     ) | ||||
|  | ||||
|  | ||||
| def polynorm_vllm( | ||||
|     x: torch.Tensor, | ||||
|     weight: torch.Tensor, | ||||
|     bias: torch.Tensor, | ||||
|     eps: float = 1e-6, | ||||
| ): | ||||
|     orig_shape = x.shape | ||||
|     x = x.view(-1, x.shape[-1]) | ||||
|  | ||||
|     out = torch.empty_like(x) | ||||
|     vllm_ops.poly_norm(out, x, weight, bias, eps) | ||||
|     output = out | ||||
|  | ||||
|     output = output.view(orig_shape) | ||||
|     return output | ||||
|  | ||||
|  | ||||
| def calculate_diff(batch_size, seq_len, hidden_dim): | ||||
|     dtype = torch.bfloat16 | ||||
|     x = torch.randn(batch_size, seq_len, hidden_dim, dtype=dtype, device="cuda") | ||||
|     weight = torch.ones(3, dtype=dtype, device="cuda") | ||||
|     bias = torch.ones(1, dtype=dtype, device="cuda") | ||||
|  | ||||
|     output_naive = polynorm_naive(x, weight, bias) | ||||
|     output_vllm = polynorm_vllm(x, weight, bias) | ||||
|  | ||||
|     if torch.allclose(output_naive, output_vllm, atol=1e-2, rtol=1e-2): | ||||
|         print("✅ All implementations match") | ||||
|     else: | ||||
|         print("❌ Implementations differ") | ||||
|  | ||||
|  | ||||
| batch_size_range = [2**i for i in range(0, 7, 2)] | ||||
| seq_length_range = [2**i for i in range(6, 11, 1)] | ||||
| dim_range = [2048, 4096] | ||||
| configs = list(itertools.product(dim_range, batch_size_range, seq_length_range)) | ||||
|  | ||||
|  | ||||
| def get_benchmark(): | ||||
|     @triton.testing.perf_report( | ||||
|         triton.testing.Benchmark( | ||||
|             x_names=["dim", "batch_size", "seq_len"], | ||||
|             x_vals=[list(_) for _ in configs], | ||||
|             line_arg="provider", | ||||
|             line_vals=["naive", "vllm"], | ||||
|             line_names=["Naive", "vLLM"], | ||||
|             styles=[("blue", "-"), ("red", "-")], | ||||
|             ylabel="us", | ||||
|             plot_name="polynorm-perf", | ||||
|             args={}, | ||||
|         ) | ||||
|     ) | ||||
|     def benchmark(dim, batch_size, seq_len, provider): | ||||
|         dtype = torch.bfloat16 | ||||
|         hidden_dim = dim * 4 | ||||
|  | ||||
|         x = torch.randn(batch_size, seq_len, hidden_dim, dtype=dtype, device="cuda") | ||||
|         weight = torch.ones(3, dtype=dtype, device="cuda") | ||||
|         bias = torch.ones(1, dtype=dtype, device="cuda") | ||||
|  | ||||
|         quantiles = [0.5, 0.2, 0.8] | ||||
|  | ||||
|         if provider == "naive": | ||||
|             ms, min_ms, max_ms = triton.testing.do_bench( | ||||
|                 lambda: polynorm_naive(x, weight, bias), | ||||
|                 quantiles=quantiles, | ||||
|             ) | ||||
|         else: | ||||
|             ms, min_ms, max_ms = triton.testing.do_bench( | ||||
|                 lambda: polynorm_vllm(x, weight, bias), | ||||
|                 quantiles=quantiles, | ||||
|             ) | ||||
|  | ||||
|         return 1000 * ms, 1000 * max_ms, 1000 * min_ms | ||||
|  | ||||
|     return benchmark | ||||
|  | ||||
|  | ||||
| if __name__ == "__main__": | ||||
|     import argparse | ||||
|  | ||||
|     parser = argparse.ArgumentParser() | ||||
|     parser.add_argument( | ||||
|         "--batch-size", | ||||
|         type=int, | ||||
|         default=4, | ||||
|         help="Batch size", | ||||
|     ) | ||||
|     parser.add_argument( | ||||
|         "--seq-len", | ||||
|         type=int, | ||||
|         default=128, | ||||
|         help="Sequence length", | ||||
|     ) | ||||
|     parser.add_argument( | ||||
|         "--hidden-dim", | ||||
|         type=int, | ||||
|         default=8192, | ||||
|         help="Intermediate size of MLP", | ||||
|     ) | ||||
|     parser.add_argument( | ||||
|         "--save-path", | ||||
|         type=str, | ||||
|         default="./configs/polnorm/", | ||||
|         help="Path to save polnorm benchmark results", | ||||
|     ) | ||||
|  | ||||
|     args = parser.parse_args() | ||||
|  | ||||
|     # Run correctness test | ||||
|     calculate_diff( | ||||
|         batch_size=args.batch_size, | ||||
|         seq_len=args.seq_len, | ||||
|         hidden_dim=args.hidden_dim, | ||||
|     ) | ||||
|  | ||||
|     benchmark = get_benchmark() | ||||
|     # Run performance benchmark | ||||
|     benchmark.run(print_data=True, save_path=args.save_path) | ||||
| @ -7,7 +7,8 @@ import torch | ||||
|  | ||||
| from vllm import _custom_ops as ops | ||||
| from vllm.platforms import current_platform | ||||
| from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser | ||||
| from vllm.utils import FlexibleArgumentParser | ||||
| from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE | ||||
|  | ||||
|  | ||||
| @torch.inference_mode() | ||||
|  | ||||
							
								
								
									
										172
									
								
								benchmarks/kernels/benchmark_reshape_and_cache.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										172
									
								
								benchmarks/kernels/benchmark_reshape_and_cache.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,172 @@ | ||||
| # SPDX-License-Identifier: Apache-2.0 | ||||
| # SPDX-FileCopyrightText: Copyright contributors to the vLLM project | ||||
| import random | ||||
| import time | ||||
|  | ||||
| import torch | ||||
| from tabulate import tabulate | ||||
|  | ||||
| from vllm import _custom_ops as ops | ||||
| from vllm.logger import init_logger | ||||
| from vllm.platforms import current_platform | ||||
| from vllm.utils import FlexibleArgumentParser | ||||
| from vllm.utils.torch_utils import ( | ||||
|     STR_DTYPE_TO_TORCH_DTYPE, | ||||
|     create_kv_caches_with_random, | ||||
| ) | ||||
|  | ||||
| logger = init_logger(__name__) | ||||
|  | ||||
|  | ||||
| @torch.inference_mode() | ||||
| def run_benchmark( | ||||
|     num_tokens: int, | ||||
|     num_heads: int, | ||||
|     head_size: int, | ||||
|     block_size: int, | ||||
|     num_blocks: int, | ||||
|     dtype: torch.dtype, | ||||
|     kv_cache_dtype: str, | ||||
|     num_iters: int, | ||||
|     benchmark_mode: str, | ||||
|     device: str = "cuda", | ||||
| ) -> float: | ||||
|     """Return latency (seconds) for given num_tokens.""" | ||||
|  | ||||
|     if kv_cache_dtype == "fp8" and head_size % 16: | ||||
|         raise ValueError("fp8 kv-cache requires head_size to be a multiple of 16.") | ||||
|  | ||||
|     current_platform.seed_everything(42) | ||||
|     torch.set_default_device(device) | ||||
|  | ||||
|     # create random key / value tensors [T, H, D]. | ||||
|     key = torch.randn(num_tokens, num_heads, head_size, dtype=dtype, device=device) | ||||
|     value = torch.randn_like(key) | ||||
|  | ||||
|     # prepare the slot mapping. | ||||
|     # each token is assigned a unique slot in the KV-cache. | ||||
|     num_slots = block_size * num_blocks | ||||
|     if num_tokens > num_slots: | ||||
|         raise ValueError("num_tokens cannot exceed the total number of cache slots") | ||||
|     slot_mapping_lst = random.sample(range(num_slots), num_tokens) | ||||
|     slot_mapping = torch.tensor(slot_mapping_lst, dtype=torch.long, device=device) | ||||
|  | ||||
|     key_caches, value_caches = create_kv_caches_with_random( | ||||
|         num_blocks, | ||||
|         block_size, | ||||
|         1,  # num_layers | ||||
|         num_heads, | ||||
|         head_size, | ||||
|         kv_cache_dtype, | ||||
|         dtype, | ||||
|         device=device, | ||||
|     ) | ||||
|     key_cache, value_cache = key_caches[0], value_caches[0] | ||||
|     # to free unused memory | ||||
|     del key_caches, value_caches | ||||
|  | ||||
|     # compute per-kernel scaling factors for fp8 conversion (if used). | ||||
|     k_scale = (key.amax() / 64.0).to(torch.float32) | ||||
|     v_scale = (value.amax() / 64.0).to(torch.float32) | ||||
|  | ||||
|     function_under_test = lambda: ops.reshape_and_cache( | ||||
|         key,  # noqa: F821 | ||||
|         value,  # noqa: F821 | ||||
|         key_cache,  # noqa: F821 | ||||
|         value_cache,  # noqa: F821 | ||||
|         slot_mapping,  # noqa: F821 | ||||
|         kv_cache_dtype, | ||||
|         k_scale, | ||||
|         v_scale, | ||||
|     ) | ||||
|  | ||||
|     if benchmark_mode == "cudagraph": | ||||
|         g = torch.cuda.CUDAGraph() | ||||
|         with torch.cuda.graph(g): | ||||
|             function_under_test() | ||||
|         torch.cuda.synchronize() | ||||
|         function_under_test = lambda: g.replay() | ||||
|  | ||||
|     def run_cuda_benchmark(n_iters: int) -> float: | ||||
|         nonlocal key, value, key_cache, value_cache, slot_mapping | ||||
|         torch.cuda.synchronize() | ||||
|         start = time.perf_counter() | ||||
|         for _ in range(n_iters): | ||||
|             function_under_test() | ||||
|             torch.cuda.synchronize() | ||||
|         end = time.perf_counter() | ||||
|         return (end - start) / n_iters | ||||
|  | ||||
|     # warm-up | ||||
|     run_cuda_benchmark(3) | ||||
|  | ||||
|     lat = run_cuda_benchmark(num_iters) | ||||
|  | ||||
|     # free tensors to mitigate OOM when sweeping | ||||
|     del key, value, key_cache, value_cache, slot_mapping | ||||
|     torch.cuda.empty_cache() | ||||
|  | ||||
|     return lat | ||||
|  | ||||
|  | ||||
| def main(args): | ||||
|     rows = [] | ||||
|     for exp in range(1, 17): | ||||
|         n_tok = 2**exp | ||||
|         lat = run_benchmark( | ||||
|             num_tokens=n_tok, | ||||
|             num_heads=args.num_heads, | ||||
|             head_size=args.head_size, | ||||
|             block_size=args.block_size, | ||||
|             num_blocks=args.num_blocks, | ||||
|             dtype=STR_DTYPE_TO_TORCH_DTYPE[args.dtype], | ||||
|             kv_cache_dtype=args.kv_cache_dtype, | ||||
|             num_iters=args.iters, | ||||
|             benchmark_mode=args.mode, | ||||
|             device="cuda", | ||||
|         ) | ||||
|         rows.append([n_tok, lat * 1e6])  # convert to microseconds | ||||
|  | ||||
|     print(f"Benchmark results for implementation cuda (measuring with {args.mode}):") | ||||
|     print(tabulate(rows, headers=["num_tokens", "latency (µs)"], floatfmt=".3f")) | ||||
|  | ||||
|  | ||||
| if __name__ == "__main__": | ||||
|     parser = FlexibleArgumentParser() | ||||
|  | ||||
|     parser.add_argument("--num-heads", type=int, default=128) | ||||
|     parser.add_argument( | ||||
|         "--head-size", | ||||
|         type=int, | ||||
|         choices=[64, 80, 96, 112, 120, 128, 192, 256], | ||||
|         default=128, | ||||
|     ) | ||||
|     parser.add_argument("--block-size", type=int, choices=[16, 32], default=16) | ||||
|     parser.add_argument("--num-blocks", type=int, default=128 * 128) | ||||
|  | ||||
|     parser.add_argument( | ||||
|         "--dtype", | ||||
|         type=str, | ||||
|         choices=["half", "bfloat16", "float"], | ||||
|         default="bfloat16", | ||||
|     ) | ||||
|  | ||||
|     parser.add_argument( | ||||
|         "--kv-cache-dtype", | ||||
|         type=str, | ||||
|         choices=["auto", "fp8"], | ||||
|         default="auto", | ||||
|     ) | ||||
|  | ||||
|     parser.add_argument("--iters", type=int, default=200) | ||||
|  | ||||
|     parser.add_argument( | ||||
|         "--mode", | ||||
|         type=str, | ||||
|         choices=["cudagraph", "no_graph"], | ||||
|         default="cudagraph", | ||||
|     ) | ||||
|  | ||||
|     args = parser.parse_args() | ||||
|  | ||||
|     main(args) | ||||
| @ -1,7 +1,5 @@ | ||||
| # SPDX-License-Identifier: Apache-2.0 | ||||
| # SPDX-FileCopyrightText: Copyright contributors to the vLLM project | ||||
| from __future__ import annotations | ||||
|  | ||||
| import random | ||||
| import time | ||||
|  | ||||
| @ -14,9 +12,9 @@ from vllm.attention.ops.triton_reshape_and_cache_flash import ( | ||||
| ) | ||||
| from vllm.logger import init_logger | ||||
| from vllm.platforms import current_platform | ||||
| from vllm.utils import ( | ||||
| from vllm.utils import FlexibleArgumentParser | ||||
| from vllm.utils.torch_utils import ( | ||||
|     STR_DTYPE_TO_TORCH_DTYPE, | ||||
|     FlexibleArgumentParser, | ||||
|     create_kv_caches_with_random_flash, | ||||
| ) | ||||
|  | ||||
|  | ||||
| @ -2,7 +2,6 @@ | ||||
| # SPDX-FileCopyrightText: Copyright contributors to the vLLM project | ||||
|  | ||||
| import itertools | ||||
| from typing import Optional, Union | ||||
|  | ||||
| import torch | ||||
| from flashinfer.norm import fused_add_rmsnorm, rmsnorm | ||||
| @ -21,8 +20,8 @@ class HuggingFaceRMSNorm(nn.Module): | ||||
|     def forward( | ||||
|         self, | ||||
|         x: torch.Tensor, | ||||
|         residual: Optional[torch.Tensor] = None, | ||||
|     ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]: | ||||
|         residual: torch.Tensor | None = None, | ||||
|     ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: | ||||
|         orig_dtype = x.dtype | ||||
|         x = x.to(torch.float32) | ||||
|         if residual is not None: | ||||
| @ -41,7 +40,7 @@ class HuggingFaceRMSNorm(nn.Module): | ||||
| def rmsnorm_naive( | ||||
|     x: torch.Tensor, | ||||
|     weight: torch.Tensor, | ||||
|     residual: Optional[torch.Tensor] = None, | ||||
|     residual: torch.Tensor | None = None, | ||||
|     eps: float = 1e-6, | ||||
| ): | ||||
|     naive_norm = HuggingFaceRMSNorm(x.shape[-1], eps=eps) | ||||
| @ -65,7 +64,7 @@ def rmsnorm_naive( | ||||
| def rmsnorm_flashinfer( | ||||
|     x: torch.Tensor, | ||||
|     weight: torch.Tensor, | ||||
|     residual: Optional[torch.Tensor] = None, | ||||
|     residual: torch.Tensor | None = None, | ||||
|     eps: float = 1e-6, | ||||
| ): | ||||
|     orig_shape = x.shape | ||||
| @ -89,7 +88,7 @@ def rmsnorm_flashinfer( | ||||
| def rmsnorm_vllm( | ||||
|     x: torch.Tensor, | ||||
|     weight: torch.Tensor, | ||||
|     residual: Optional[torch.Tensor] = None, | ||||
|     residual: torch.Tensor | None = None, | ||||
|     eps: float = 1e-6, | ||||
| ): | ||||
|     orig_shape = x.shape | ||||
|  | ||||
| @ -2,7 +2,6 @@ | ||||
| # SPDX-FileCopyrightText: Copyright contributors to the vLLM project | ||||
|  | ||||
| from itertools import accumulate | ||||
| from typing import Optional | ||||
|  | ||||
| import nvtx | ||||
| import torch | ||||
| @ -18,7 +17,7 @@ def benchmark_rope_kernels_multi_lora( | ||||
|     seq_len: int, | ||||
|     num_heads: int, | ||||
|     head_size: int, | ||||
|     rotary_dim: Optional[int], | ||||
|     rotary_dim: int | None, | ||||
|     dtype: torch.dtype, | ||||
|     seed: int, | ||||
|     device: str, | ||||
|  | ||||
| @ -1,5 +1,19 @@ | ||||
| # SPDX-License-Identifier: Apache-2.0 | ||||
| # SPDX-FileCopyrightText: Copyright contributors to the vLLM project | ||||
|  | ||||
| """ | ||||
| Comprehensive 3-way SiLU Benchmark Suite | ||||
|  | ||||
| This benchmark compares three SiLU implementations: | ||||
| 1. SiLU V2 (CUDA) - Optimized CUDA kernel implementation | ||||
| 2. Triton Kernel - Triton-based implementation | ||||
|  | ||||
| The suite generates detailed performance comparisons including: | ||||
| - Memory bandwidth utilization | ||||
| - Speedup ratios (baseline vs optimized implementations) | ||||
| - Performance across different expert configurations and token distributions | ||||
| """ | ||||
|  | ||||
| from collections.abc import Callable | ||||
|  | ||||
| import matplotlib.pyplot as plt | ||||
| @ -7,7 +21,7 @@ import numpy as np | ||||
| import torch | ||||
|  | ||||
| from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import ( | ||||
|     silu_mul_fp8_quant_deep_gemm_cuda, | ||||
|     persistent_masked_m_silu_mul_quant, | ||||
| ) | ||||
| from vllm.platforms import current_platform | ||||
| from vllm.triton_utils import tl, triton | ||||
| @ -94,6 +108,7 @@ def silu_mul_fp8_quant_deep_gemm_triton( | ||||
|     num_parallel_tokens, | ||||
|     group_size: int = 128, | ||||
|     eps: float = 1e-10, | ||||
|     expert_offsets: torch.Tensor = None, | ||||
| ) -> tuple[torch.Tensor, torch.Tensor]: | ||||
|     """Quantize silu(y[..., :H]) * y[..., H:] to FP8 with group per-token scales | ||||
|  | ||||
| @ -174,7 +189,7 @@ def silu_mul_fp8_quant_deep_gemm_triton( | ||||
|  | ||||
|  | ||||
| # Parse generation strategies | ||||
| strategies = ["uniform", "max_t", "first_t"] | ||||
| strategies = ["random_imbalanced", "uniform", "max_t"] | ||||
|  | ||||
|  | ||||
| def benchmark( | ||||
| @ -195,15 +210,27 @@ def benchmark( | ||||
|         current_platform.seed_everything(42 + seed_offset) | ||||
|         y = torch.rand((E, T, 2 * H), dtype=torch.bfloat16, device="cuda").contiguous() | ||||
|  | ||||
|         if gen_strategy == "uniform": | ||||
|             r = torch.rand(size=(E,), device="cuda") | ||||
|         if gen_strategy == "random_imbalanced": | ||||
|  | ||||
|             def generate_expert_loads(n_e, total_tokens, ratio, device="cuda"): | ||||
|                 mean = total_tokens // n_e | ||||
|                 min_max = mean // ratio | ||||
|                 e = torch.ones(size=(E,), dtype=torch.int64, device=device) * mean | ||||
|                 e[0] = min_max | ||||
|                 r = torch.rand(size=(E - 1,)) | ||||
|                 r /= r.sum() | ||||
|                 r *= total_tokens - min_max | ||||
|                 r = r.round().long() | ||||
|                 e[1:] = r.to(device=device) | ||||
|                 return e | ||||
|  | ||||
|             tokens_per_expert = generate_expert_loads(E, total_tokens, 0.7, "cuda") | ||||
|         elif gen_strategy == "uniform": | ||||
|             r = torch.rand(size=(E,)) | ||||
|             r /= r.sum() | ||||
|             r *= total_tokens | ||||
|             tokens_per_expert = r.int() | ||||
|             tokens_per_expert = torch.minimum( | ||||
|                 tokens_per_expert, | ||||
|                 torch.ones((E,), device=r.device, dtype=torch.int) * T, | ||||
|             ) | ||||
|             r = r.round().long() | ||||
|             tokens_per_expert = r | ||||
|         elif gen_strategy == "max_t": | ||||
|             tokens_per_expert = torch.empty(size=(E,), dtype=torch.int32, device="cuda") | ||||
|             tokens_per_expert.fill_(total_tokens / E) | ||||
| @ -281,40 +308,34 @@ def benchmark( | ||||
|  | ||||
|  | ||||
| def create_comparison_plot( | ||||
|     ratio, cuda_times, baseline_times, config_labels, strategy_name, id | ||||
|     ratios, silu_v2_times, triton_times, config_labels, strategy_name, id | ||||
| ): | ||||
|     """Create a comparison plot for a specific generation strategy""" | ||||
|     fig, ax = plt.subplots(1, 1, figsize=(16, 6)) | ||||
|     fig, ax = plt.subplots(1, 1, figsize=(18, 6)) | ||||
|  | ||||
|     # Configure x-axis positions | ||||
|     x = np.arange(len(config_labels)) | ||||
|     width = 0.35 | ||||
|     width = 0.25 | ||||
|  | ||||
|     # Execution Time plot (lower is better) | ||||
|     ax.bar(x, silu_v2_times, width, label="SiLU V2 (CUDA)", alpha=0.8, color="blue") | ||||
|     ax.bar( | ||||
|         x - width / 2, cuda_times, width, label="CUDA Kernel", alpha=0.8, color="blue" | ||||
|     ) | ||||
|     ax.bar( | ||||
|         x + width / 2, | ||||
|         baseline_times, | ||||
|         width, | ||||
|         label="Baseline", | ||||
|         alpha=0.8, | ||||
|         color="orange", | ||||
|         x + width, triton_times, width, label="Triton Kernel", alpha=0.8, color="green" | ||||
|     ) | ||||
|  | ||||
|     # Add speedup labels over each bar pair | ||||
|     # Add speedup labels over each pair of bars | ||||
|     for i in range(len(x)): | ||||
|         speedup = ratio[i] | ||||
|         max_height = max(cuda_times[i], baseline_times[i]) | ||||
|         triton_v2_speedup = ratios[i][1]  # triton/v2 | ||||
|         max_height = max(silu_v2_times[i], triton_times[i]) | ||||
|  | ||||
|         # Triton/V2 speedup | ||||
|         ax.text( | ||||
|             x[i], | ||||
|             x[i] + width / 2, | ||||
|             max_height + max_height * 0.02, | ||||
|             f"{speedup:.2f}x", | ||||
|             f"{triton_v2_speedup:.2f}x", | ||||
|             ha="center", | ||||
|             va="bottom", | ||||
|             fontweight="bold", | ||||
|             fontsize=9, | ||||
|             fontsize=8, | ||||
|         ) | ||||
|  | ||||
|     ax.set_xlabel("Configuration") | ||||
| @ -332,56 +353,75 @@ def create_comparison_plot( | ||||
|  | ||||
|  | ||||
| def create_combined_plot(all_results): | ||||
|     """Create a combined plot with all strategies in one PNG""" | ||||
|     num_strategies = len(all_results) | ||||
|     fig, axes = plt.subplots(num_strategies, 1, figsize=(20, 6 * num_strategies)) | ||||
|     fig, axes = plt.subplots(num_strategies, 1, figsize=(22, 7 * num_strategies)) | ||||
|  | ||||
|     if num_strategies == 1: | ||||
|         axes = [axes] | ||||
|  | ||||
|     for idx, ( | ||||
|         strategy_name, | ||||
|         ratio, | ||||
|         cuda_times, | ||||
|         baseline_times, | ||||
|         all_ratios, | ||||
|         all_silu_v2_results, | ||||
|         all_triton_results, | ||||
|         config_labels, | ||||
|         config_x_axis, | ||||
|     ) in enumerate(all_results): | ||||
|         ax = axes[idx] | ||||
|  | ||||
|         # Flatten the nested results to get bandwidth percentages for plotting | ||||
|         silu_v2_bandwidths = [] | ||||
|         triton_bandwidths = [] | ||||
|         flat_ratios = [] | ||||
|  | ||||
|         for config_results in all_silu_v2_results: | ||||
|             for result in config_results: | ||||
|                 silu_v2_bandwidths.append(result[3])  # bandwidth percentage | ||||
|  | ||||
|         for config_results in all_triton_results: | ||||
|             for result in config_results: | ||||
|                 triton_bandwidths.append(result[3])  # bandwidth percentage | ||||
|  | ||||
|         for config_ratios in all_ratios: | ||||
|             for ratio in config_ratios: | ||||
|                 flat_ratios.append(ratio) | ||||
|  | ||||
|         # Configure x-axis positions | ||||
|         x = np.arange(len(config_labels)) | ||||
|         width = 0.35 | ||||
|         width = 0.25 | ||||
|  | ||||
|         # Execution Time plot (lower is better) | ||||
|         # Bandwidth utilization plot (higher is better) | ||||
|         ax.bar( | ||||
|             x - width / 2, | ||||
|             cuda_times, | ||||
|             x, | ||||
|             silu_v2_bandwidths, | ||||
|             width, | ||||
|             label="CUDA Kernel", | ||||
|             label="SiLU V2 (CUDA)", | ||||
|             alpha=0.8, | ||||
|             color="blue", | ||||
|         ) | ||||
|         ax.bar( | ||||
|             x + width / 2, | ||||
|             baseline_times, | ||||
|             x + width, | ||||
|             triton_bandwidths, | ||||
|             width, | ||||
|             label="Baseline", | ||||
|             label="Triton Kernel", | ||||
|             alpha=0.8, | ||||
|             color="orange", | ||||
|             color="green", | ||||
|         ) | ||||
|  | ||||
|         # Add speedup labels over each bar pair | ||||
|         # Add speedup labels over each pair of bars | ||||
|         for i in range(len(x)): | ||||
|             speedup = ratio[i] | ||||
|             max_height = max(cuda_times[i], baseline_times[i]) | ||||
|             triton_v2_speedup = flat_ratios[i]  # triton/v2 | ||||
|             max_height = max(silu_v2_bandwidths[i], triton_bandwidths[i]) | ||||
|  | ||||
|             # Triton/V2 speedup | ||||
|             ax.text( | ||||
|                 x[i], | ||||
|                 x[i] + width / 2, | ||||
|                 max_height + max_height * 0.02, | ||||
|                 f"{speedup:.2f}x", | ||||
|                 f"{triton_v2_speedup:.2f}x", | ||||
|                 ha="center", | ||||
|                 va="bottom", | ||||
|                 fontweight="bold", | ||||
|                 fontsize=9, | ||||
|                 fontsize=8, | ||||
|             ) | ||||
|  | ||||
|         ax.set_xlabel("Configuration") | ||||
| @ -395,7 +435,7 @@ def create_combined_plot(all_results): | ||||
|         ax.grid(True, alpha=0.3) | ||||
|  | ||||
|     plt.tight_layout() | ||||
|     filename = "../../silu_bench/silu_benchmark_combined.png" | ||||
|     filename = "silu_benchmark_combined_3way.png" | ||||
|     plt.savefig(filename, dpi=300, bbox_inches="tight") | ||||
|     plt.show() | ||||
|  | ||||
| @ -405,7 +445,9 @@ def create_combined_plot(all_results): | ||||
| outer_dim = 7168 | ||||
| configs = [ | ||||
|     # DeepSeekV3 Configs | ||||
|     # (1, 56, 7168), | ||||
|     (8, 1024, 7168), | ||||
|     # (32, 56, 7168), | ||||
|     # DeepSeekV3 Configs | ||||
|     (32, 1024, 7168), | ||||
|     # DeepSeekV3 Configs | ||||
| @ -417,6 +459,7 @@ num_warmups = 20 | ||||
|  | ||||
| strategy_descriptions = { | ||||
|     "uniform": "Uniform Random", | ||||
|     "random_imbalanced": "Imbalanced Random", | ||||
|     "max_t": "Even Assignment", | ||||
|     "first_t": "experts[0] = T, experts[1:] = 0", | ||||
| } | ||||
| @ -433,28 +476,31 @@ for id, strategy in enumerate(strategies): | ||||
|     print(f"Testing strategy: {strategy_descriptions[strategy]}") | ||||
|     print(f"{'=' * 60}") | ||||
|  | ||||
|     # Collect benchmark data for both algorithms | ||||
|     # Collect benchmark data for both algorithms (SiLU V2 CUDA and Triton) | ||||
|     config_labels = [] | ||||
|     config_x_axis = [] | ||||
|     all_cuda_results = [] | ||||
|     all_baseline_results = [] | ||||
|     all_silu_v2_results = [] | ||||
|     all_triton_results = [] | ||||
|     all_ratios = [] | ||||
|  | ||||
|     for E, T, H in configs: | ||||
|         total_tokens_config = [8 * E, 16 * E, 32 * E, 64 * E, 128 * E, 256 * E] | ||||
|         total_tokens_config = [] | ||||
|         for i in [8, 16, 32, 64, 128, 256, 512]: | ||||
|             if i <= T: | ||||
|                 total_tokens_config.append(i * E) | ||||
|         config_x_axis.append(total_tokens_config) | ||||
|  | ||||
|         cuda_results = [] | ||||
|         baseline_results = [] | ||||
|         silu_v2_results = [] | ||||
|         triton_results = [] | ||||
|         ratios = [] | ||||
|  | ||||
|         for total_tokens in total_tokens_config: | ||||
|             config_label = f"E={E},T={T},H={H},TT={total_tokens}" | ||||
|             config_labels.append(config_label) | ||||
|  | ||||
|             # CUDA kernel results | ||||
|             time_ms_cuda, gflops, gbps, perc = benchmark( | ||||
|                 silu_mul_fp8_quant_deep_gemm_cuda, | ||||
|             # SiLU V2 (CUDA kernel) results | ||||
|             time_ms_silu_v2, gflops, gbps, perc = benchmark( | ||||
|                 persistent_masked_m_silu_mul_quant, | ||||
|                 E, | ||||
|                 T, | ||||
|                 H, | ||||
| @ -463,9 +509,9 @@ for id, strategy in enumerate(strategies): | ||||
|                 num_warmups=num_warmups, | ||||
|                 gen_strategy=strategy, | ||||
|             ) | ||||
|             cuda_results.append((time_ms_cuda, gflops, gbps, perc)) | ||||
|             silu_v2_results.append((time_ms_silu_v2, gflops, gbps, perc)) | ||||
|  | ||||
|             # Baseline results | ||||
|             # Triton kernel results | ||||
|             time_ms_triton, gflops, gbps, perc = benchmark( | ||||
|                 silu_mul_fp8_quant_deep_gemm_triton, | ||||
|                 E, | ||||
| @ -476,12 +522,20 @@ for id, strategy in enumerate(strategies): | ||||
|                 num_warmups=num_warmups, | ||||
|                 gen_strategy=strategy, | ||||
|             ) | ||||
|             baseline_results.append((time_ms_triton, gflops, gbps, perc)) | ||||
|             ratios.append(time_ms_triton / time_ms_cuda) | ||||
|             triton_results.append((time_ms_triton, gflops, gbps, perc)) | ||||
|  | ||||
|             print(f"Completed: {config_label}") | ||||
|         all_cuda_results.append(cuda_results) | ||||
|         all_baseline_results.append(baseline_results) | ||||
|             # Calculate speedup ratios (triton baseline / implementation) | ||||
|             triton_v2_ratio = time_ms_triton / time_ms_silu_v2 | ||||
|             ratios.append(triton_v2_ratio) | ||||
|  | ||||
|             print( | ||||
|                 f"Completed: {config_label}:" | ||||
|                 f" V2: {time_ms_silu_v2:.3f}ms," | ||||
|                 f" Triton: {time_ms_triton:.3f}ms" | ||||
|             ) | ||||
|  | ||||
|         all_silu_v2_results.append(silu_v2_results) | ||||
|         all_triton_results.append(triton_results) | ||||
|         all_ratios.append(ratios) | ||||
|  | ||||
|     # Store results for combined plotting | ||||
| @ -489,8 +543,8 @@ for id, strategy in enumerate(strategies): | ||||
|         ( | ||||
|             strategy_descriptions[strategy], | ||||
|             all_ratios, | ||||
|             all_cuda_results, | ||||
|             all_baseline_results, | ||||
|             all_silu_v2_results, | ||||
|             all_triton_results, | ||||
|             config_labels, | ||||
|             config_x_axis, | ||||
|         ) | ||||
| @ -498,15 +552,18 @@ for id, strategy in enumerate(strategies): | ||||
|  | ||||
|     # Print summary table for this strategy | ||||
|     print(f"\nSummary Table - {strategy_descriptions[strategy]}:") | ||||
|     print(f"{'Config':<20} {'CUDA Time(ms)':<12} {'Base Time(ms)':<12} {'Speedup':<8}") | ||||
|     print("-" * 60) | ||||
|     print(f" {'V2 Time(ms)':<12} {'Triton Time(ms)':<14} {'Triton/V2':<10}") | ||||
|     print("-" * 90) | ||||
|  | ||||
|     for i, (E, T, H) in enumerate(configs): | ||||
|         speedup = baseline_results[i][0] / cuda_results[i][0] | ||||
|         # NOTE(review): silu_v2_results/triton_results hold only the last config's | ||||
|         # rows here, so index i picks that config's i-th token count, not config i. | ||||
|         v2_time = silu_v2_results[i][0] | ||||
|         triton_time = triton_results[i][0] | ||||
|         triton_v2_speedup = triton_time / v2_time | ||||
|         config_label = f"E={E:3d},T={T:4d},H={H:4d}" | ||||
|         print( | ||||
|             f"{config_label:<20} {cuda_results[i][0]:8.5f} " | ||||
|             f"{baseline_results[i][0]:8.5f} {speedup:6.2f}x" | ||||
|             f"{config_label:<20} {v2_time:8.5f} {triton_time:10.5f} " | ||||
|             f"{triton_v2_speedup:8.2f}x" | ||||
|         ) | ||||
|  | ||||
|  | ||||
| @ -514,15 +571,14 @@ def create_total_tokens_plot(all_results): | ||||
|     num_strategies = len(all_results) | ||||
|     num_configs = len(configs) | ||||
|  | ||||
|     # Create side-by-side subplots: 2 columns for speedup and bandwidth percentage | ||||
|     fig, axs = plt.subplots( | ||||
|         num_strategies, num_configs * 2, figsize=(28, 6 * num_strategies) | ||||
|         num_strategies, num_configs * 2, figsize=(32, 8 * num_strategies) | ||||
|     ) | ||||
|  | ||||
|     # Add main title to the entire figure | ||||
|     fig.suptitle( | ||||
|         "Performance Analysis: Speedup vs Bandwidth Utilization (Triton & CUDA)", | ||||
|         fontsize=16, | ||||
|         "Performance Analysis: Speedup vs Bandwidth Utilization (SiLU V2, and Triton)", | ||||
|         fontsize=18, | ||||
|         fontweight="bold", | ||||
|         y=0.98, | ||||
|     ) | ||||
| @ -539,8 +595,8 @@ def create_total_tokens_plot(all_results): | ||||
|         ( | ||||
|             strategy_name, | ||||
|             all_ratios, | ||||
|             all_cuda_results, | ||||
|             all_baseline_results, | ||||
|             all_silu_v2_results, | ||||
|             all_triton_results, | ||||
|             config_labels, | ||||
|             config_x_axis, | ||||
|         ) = result | ||||
| @ -555,42 +611,54 @@ def create_total_tokens_plot(all_results): | ||||
|             ratios = all_ratios[config_idx] | ||||
|             total_tokens_values = config_x_axis[config_idx] | ||||
|  | ||||
|             # Extract CUDA and Triton bandwidth percentages | ||||
|             cuda_bandwidth_percentages = [ | ||||
|                 result[3] for result in all_cuda_results[config_idx] | ||||
|             # Extract speedup ratios | ||||
|             triton_v2_ratios = [ratio for ratio in ratios] | ||||
|  | ||||
|             # Extract bandwidth percentages for all implementations | ||||
|             v2_bandwidth_percentages = [ | ||||
|                 result[3] for result in all_silu_v2_results[config_idx] | ||||
|             ] | ||||
|             triton_bandwidth_percentages = [ | ||||
|                 result[3] for result in all_baseline_results[config_idx] | ||||
|                 result[3] for result in all_triton_results[config_idx] | ||||
|             ] | ||||
|  | ||||
|             # Plot speedup ratios vs total tokens (left plot) | ||||
|             ax_speedup.plot( | ||||
|                 total_tokens_values, ratios, "bo-", linewidth=3, markersize=8 | ||||
|                 total_tokens_values, | ||||
|                 triton_v2_ratios, | ||||
|                 "go-", | ||||
|                 linewidth=3, | ||||
|                 markersize=8, | ||||
|                 label="Triton/V2 Speedup", | ||||
|             ) | ||||
|             ax_speedup.set_title( | ||||
|                 f"{strategy_name}\nSpeedup (CUDA/Triton)\nE={E}, T={T}, H={H}", | ||||
|                 f"{strategy_name}\nSpeedup vs Baseline (Triton)\nE={E}, T={T}, H={H}", | ||||
|                 fontsize=12, | ||||
|                 fontweight="bold", | ||||
|             ) | ||||
|             ax_speedup.set_xlabel("Total Tokens", fontweight="bold", fontsize=11) | ||||
|             ax_speedup.set_ylabel("Speedup Ratio", fontweight="bold", fontsize=11) | ||||
|             ax_speedup.legend(prop={"weight": "bold"}) | ||||
|             ax_speedup.grid(True, alpha=0.3) | ||||
|  | ||||
|             # Plot bandwidth utilization (right plot) | ||||
|             ax_bandwidth.plot( | ||||
|                 total_tokens_values, | ||||
|                 cuda_bandwidth_percentages, | ||||
|                 "ro-", | ||||
|                 v2_bandwidth_percentages, | ||||
|                 "o-", | ||||
|                 linewidth=3, | ||||
|                 markersize=8, | ||||
|                 label="CUDA", | ||||
|                 label="SiLU V2", | ||||
|                 color="blue", | ||||
|             ) | ||||
|             ax_bandwidth.plot( | ||||
|                 total_tokens_values, | ||||
|                 triton_bandwidth_percentages, | ||||
|                 "go-", | ||||
|                 "o-", | ||||
|                 linewidth=3, | ||||
|                 markersize=8, | ||||
|                 label="Triton", | ||||
|                 color="green", | ||||
|             ) | ||||
|             ax_bandwidth.set_title( | ||||
|                 f"{strategy_name}\nBandwidth Utilization (Hopper)\nE={E}, T={T}, H={H}", | ||||
| @ -618,38 +686,12 @@ def create_total_tokens_plot(all_results): | ||||
|                 for label in ax.get_xticklabels() + ax.get_yticklabels(): | ||||
|                     label.set_fontweight("bold") | ||||
|  | ||||
|             # Add value labels on speedup points | ||||
|             for x, y in zip(total_tokens_values, ratios): | ||||
|             # Add value labels on Triton/V2 speedup points | ||||
|             for x, y in zip(total_tokens_values, triton_v2_ratios): | ||||
|                 ax_speedup.annotate( | ||||
|                     f"{y:.2f}x", | ||||
|                     (x, y), | ||||
|                     textcoords="offset points", | ||||
|                     xytext=(0, 12), | ||||
|                     ha="center", | ||||
|                     fontsize=10, | ||||
|                     fontweight="bold", | ||||
|                     bbox=dict(boxstyle="round,pad=0.3", facecolor="white", alpha=0.7), | ||||
|                 ) | ||||
|  | ||||
|             # Add value labels on CUDA bandwidth points | ||||
|             for x, y in zip(total_tokens_values, cuda_bandwidth_percentages): | ||||
|                 ax_bandwidth.annotate( | ||||
|                     f"{y:.1f}%", | ||||
|                     (x, y), | ||||
|                     textcoords="offset points", | ||||
|                     xytext=(0, 12), | ||||
|                     ha="center", | ||||
|                     fontsize=9, | ||||
|                     fontweight="bold", | ||||
|                     bbox=dict(boxstyle="round,pad=0.2", facecolor="red", alpha=0.3), | ||||
|                 ) | ||||
|  | ||||
|             # Add value labels on Triton bandwidth points | ||||
|             for x, y in zip(total_tokens_values, triton_bandwidth_percentages): | ||||
|                 ax_bandwidth.annotate( | ||||
|                     f"{y:.1f}%", | ||||
|                     (x, y), | ||||
|                     textcoords="offset points", | ||||
|                     xytext=(0, -15), | ||||
|                     ha="center", | ||||
|                     fontsize=9, | ||||
| @ -659,17 +701,20 @@ def create_total_tokens_plot(all_results): | ||||
|  | ||||
|     plt.tight_layout() | ||||
|     plt.subplots_adjust(top=0.93)  # Make room for main title | ||||
|     filename = "silu_benchmark_total_tokens.png" | ||||
|     filename = "silu_benchmark_total_tokens_3way.png" | ||||
|     plt.savefig(filename, dpi=300, bbox_inches="tight") | ||||
|     plt.show() | ||||
|  | ||||
|     return filename | ||||
|  | ||||
|  | ||||
| # Create combined plot with all strategies | ||||
| combined_plot_filename = create_total_tokens_plot(all_results) | ||||
| # Create comprehensive 3-way comparison plots | ||||
| combined_plot_filename = create_combined_plot(all_results) | ||||
| total_tokens_plot_filename = create_total_tokens_plot(all_results) | ||||
|  | ||||
| print(f"\n{'=' * 60}") | ||||
| print("Benchmark Complete!") | ||||
| print(f"Generated combined plot: {combined_plot_filename}") | ||||
| print(f"{'=' * 60}") | ||||
| print(f"\n{'=' * 80}") | ||||
| print("3-Way Benchmark Suite Complete!") | ||||
| print(f"Generated combined comparison plot: {combined_plot_filename}") | ||||
| print(f"Generated total tokens analysis plot: {total_tokens_plot_filename}") | ||||
| print("Compared: SiLU V2 (CUDA), and Triton implementations") | ||||
| print(f"{'=' * 80}") | ||||
|  | ||||
| @ -4,7 +4,6 @@ | ||||
| import csv | ||||
| import os | ||||
| from datetime import datetime | ||||
| from typing import Optional | ||||
|  | ||||
| import flashinfer | ||||
| import torch | ||||
| @ -28,9 +27,7 @@ def to_float8(x, dtype=torch.float8_e4m3fn): | ||||
| @torch.no_grad() | ||||
| def benchmark_decode( | ||||
|     dtype: torch.dtype, | ||||
|     quant_dtypes: tuple[ | ||||
|         Optional[torch.dtype], Optional[torch.dtype], Optional[torch.dtype] | ||||
|     ], | ||||
|     quant_dtypes: tuple[torch.dtype | None, torch.dtype | None, torch.dtype | None], | ||||
|     batch_size: int, | ||||
|     max_seq_len: int, | ||||
|     num_heads: tuple[int, int] = (64, 8), | ||||
|  | ||||
| @ -4,7 +4,6 @@ | ||||
| import csv | ||||
| import os | ||||
| from datetime import datetime | ||||
| from typing import Optional | ||||
|  | ||||
| import flashinfer | ||||
| import torch | ||||
| @ -28,9 +27,7 @@ def to_float8(x, dtype=torch.float8_e4m3fn): | ||||
| @torch.no_grad() | ||||
| def benchmark_prefill( | ||||
|     dtype: torch.dtype, | ||||
|     quant_dtypes: tuple[ | ||||
|         Optional[torch.dtype], Optional[torch.dtype], Optional[torch.dtype] | ||||
|     ], | ||||
|     quant_dtypes: tuple[torch.dtype | None, torch.dtype | None, torch.dtype | None], | ||||
|     batch_size: int, | ||||
|     max_seq_len: int, | ||||
|     num_heads: tuple[int, int] = (64, 8), | ||||
|  | ||||
| @ -14,7 +14,7 @@ import torch | ||||
| from tqdm import tqdm | ||||
|  | ||||
| from vllm.model_executor.layers.quantization.utils.fp8_utils import ( | ||||
|     _w8a8_block_fp8_matmul, | ||||
|     _w8a8_triton_block_scaled_mm, | ||||
| ) | ||||
| from vllm.platforms import current_platform | ||||
| from vllm.triton_utils import triton | ||||
| @ -83,7 +83,7 @@ def w8a8_block_matmul( | ||||
|         ) | ||||
|  | ||||
|     if A.dtype == torch.float8_e4m3fn: | ||||
|         kernel = _w8a8_block_fp8_matmul | ||||
|         kernel = _w8a8_triton_block_scaled_mm | ||||
|     else: | ||||
|         raise RuntimeError("Currently, only support tune w8a8 block fp8 kernel.") | ||||
|  | ||||
|  | ||||
| @ -1,6 +1,5 @@ | ||||
| # SPDX-License-Identifier: Apache-2.0 | ||||
| # SPDX-FileCopyrightText: Copyright contributors to the vLLM project | ||||
| # fmt: off | ||||
| # ruff: noqa: E501 | ||||
| import time | ||||
|  | ||||
| @ -9,7 +8,7 @@ import torch | ||||
| from vllm import _custom_ops as ops | ||||
| from vllm.model_executor.layers.quantization.utils.fp8_utils import ( | ||||
|     per_token_group_quant_fp8, | ||||
|     w8a8_block_fp8_matmul, | ||||
|     w8a8_triton_block_scaled_mm, | ||||
| ) | ||||
| from vllm.triton_utils import triton | ||||
| from vllm.utils.deep_gemm import ( | ||||
| @ -20,19 +19,21 @@ from vllm.utils.deep_gemm import ( | ||||
| ) | ||||
|  | ||||
|  | ||||
| def benchmark_shape(m: int, | ||||
|                     n: int, | ||||
|                     k: int, | ||||
|                     warmup: int = 100, | ||||
|                     repeat: int = 10000, | ||||
|                     verbose: bool = False) -> dict: | ||||
| def benchmark_shape( | ||||
|     m: int, | ||||
|     n: int, | ||||
|     k: int, | ||||
|     warmup: int = 100, | ||||
|     repeat: int = 10000, | ||||
|     verbose: bool = False, | ||||
| ) -> dict: | ||||
|     """Benchmark all implementations for a specific (m, n, k) shape.""" | ||||
|     if verbose: | ||||
|         print(f"\n=== Benchmarking shape: m={m}, n={n}, k={k} ===") | ||||
|  | ||||
|     # Create test tensors | ||||
|     A = torch.randn((m, k), device='cuda', dtype=torch.bfloat16) | ||||
|     B = torch.randn((n, k), device='cuda', dtype=torch.bfloat16) | ||||
|     A = torch.randn((m, k), device="cuda", dtype=torch.bfloat16) | ||||
|     B = torch.randn((n, k), device="cuda", dtype=torch.bfloat16) | ||||
|  | ||||
|     # Reference result in BF16 | ||||
|     torch.cuda.synchronize() | ||||
| @ -49,34 +50,39 @@ def benchmark_shape(m: int, | ||||
|     # Pre-quantize A for all implementations | ||||
|     A_deepgemm, A_scale_deepgemm = per_token_group_quant_fp8(A, block_size[1]) | ||||
|     A_scale_deepgemm = get_col_major_tma_aligned_tensor(A_scale_deepgemm) | ||||
|     C_deepgemm = torch.empty((m, n), device='cuda', dtype=torch.bfloat16) | ||||
|     C_deepgemm = torch.empty((m, n), device="cuda", dtype=torch.bfloat16) | ||||
|     A_vllm, A_scale_vllm = per_token_group_quant_fp8(A, block_size[1]) | ||||
|     A_vllm_cutlass, A_scale_vllm_cutlass = per_token_group_quant_fp8( | ||||
|         A, block_size[1], column_major_scales=True) | ||||
|         A, block_size[1], column_major_scales=True | ||||
|     ) | ||||
|  | ||||
|     # === DeepGEMM Implementation === | ||||
|     def deepgemm_gemm(): | ||||
|         fp8_gemm_nt((A_deepgemm, A_scale_deepgemm), | ||||
|                                        (B_deepgemm, B_scale_deepgemm), | ||||
|                                        C_deepgemm) | ||||
|         fp8_gemm_nt( | ||||
|             (A_deepgemm, A_scale_deepgemm), (B_deepgemm, B_scale_deepgemm), C_deepgemm | ||||
|         ) | ||||
|         return C_deepgemm | ||||
|  | ||||
|     # === vLLM Triton Implementation === | ||||
|     def vllm_triton_gemm(): | ||||
|         return w8a8_block_fp8_matmul(A_vllm, | ||||
|                                      B_vllm, | ||||
|                                      A_scale_vllm, | ||||
|                                      B_scale_vllm, | ||||
|                                      block_size, | ||||
|                                      output_dtype=torch.bfloat16) | ||||
|         return w8a8_triton_block_scaled_mm( | ||||
|             A_vllm, | ||||
|             B_vllm, | ||||
|             A_scale_vllm, | ||||
|             B_scale_vllm, | ||||
|             block_size, | ||||
|             output_dtype=torch.bfloat16, | ||||
|         ) | ||||
|  | ||||
|     # === vLLM CUTLASS Implementation === | ||||
|     def vllm_cutlass_gemm(): | ||||
|         return ops.cutlass_scaled_mm(A_vllm_cutlass, | ||||
|                                      B_vllm.T, | ||||
|                                      scale_a=A_scale_vllm_cutlass, | ||||
|                                      scale_b=B_scale_vllm.T, | ||||
|                                      out_dtype=torch.bfloat16) | ||||
|         return ops.cutlass_scaled_mm( | ||||
|             A_vllm_cutlass, | ||||
|             B_vllm.T, | ||||
|             scale_a=A_scale_vllm_cutlass, | ||||
|             scale_b=B_scale_vllm.T, | ||||
|             out_dtype=torch.bfloat16, | ||||
|         ) | ||||
|  | ||||
|     # Run correctness check first | ||||
|     if verbose: | ||||
| @ -93,26 +99,23 @@ def benchmark_shape(m: int, | ||||
|         print(f"DeepGEMM vs Reference difference: {deepgemm_diff:.6f}") | ||||
|         print(f"vLLM Triton vs Reference difference: {vllm_triton_diff:.6f}") | ||||
|         print(f"vLLM CUTLASS vs Reference difference: {vllm_cutlass_diff:.6f}") | ||||
|         print("vLLM Triton vs DeepGEMM difference: " | ||||
|               f"{calc_diff(C_vllm_triton, C_deepgemm):.6f}") | ||||
|         print("vLLM CUTLASS vs DeepGEMM difference: " | ||||
|               f"{calc_diff(C_vllm_cutlass, C_deepgemm):.6f}") | ||||
|         print( | ||||
|             "vLLM Triton vs DeepGEMM difference: " | ||||
|             f"{calc_diff(C_vllm_triton, C_deepgemm):.6f}" | ||||
|         ) | ||||
|         print( | ||||
|             "vLLM CUTLASS vs DeepGEMM difference: " | ||||
|             f"{calc_diff(C_vllm_cutlass, C_deepgemm):.6f}" | ||||
|         ) | ||||
|  | ||||
|     # Benchmark implementations | ||||
|     implementations = { | ||||
|         "DeepGEMM": deepgemm_gemm, | ||||
|         "vLLM Triton": vllm_triton_gemm, | ||||
|         "vLLM CUTLASS": vllm_cutlass_gemm | ||||
|         "vLLM CUTLASS": vllm_cutlass_gemm, | ||||
|     } | ||||
|  | ||||
|     benchmark_results = { | ||||
|         "shape": { | ||||
|             "m": m, | ||||
|             "n": n, | ||||
|             "k": k | ||||
|         }, | ||||
|         "implementations": {} | ||||
|     } | ||||
|     benchmark_results = {"shape": {"m": m, "n": n, "k": k}, "implementations": {}} | ||||
|  | ||||
|     for name, func in implementations.items(): | ||||
|         # Warmup | ||||
| @ -140,38 +143,36 @@ def benchmark_shape(m: int, | ||||
|             "tflops": tflops, | ||||
|             "gb_s": gb_s, | ||||
|             "diff": { | ||||
|                 "DeepGEMM": | ||||
|                 0.0 if name == "DeepGEMM" else calc_diff(func(), C_deepgemm), | ||||
|                 "Reference": | ||||
|                 deepgemm_diff if name == "DeepGEMM" else | ||||
|                 (vllm_triton_diff | ||||
|                  if name == "vLLM Triton" else vllm_cutlass_diff) | ||||
|             } | ||||
|                 "DeepGEMM": 0.0 | ||||
|                 if name == "DeepGEMM" | ||||
|                 else calc_diff(func(), C_deepgemm), | ||||
|                 "Reference": deepgemm_diff | ||||
|                 if name == "DeepGEMM" | ||||
|                 else (vllm_triton_diff if name == "vLLM Triton" else vllm_cutlass_diff), | ||||
|             }, | ||||
|         } | ||||
|  | ||||
|         if verbose: | ||||
|             print( | ||||
|                 f"{name}: {avg_time_ms:.3f} ms, {tflops:.2f} TFLOPS, {gb_s:.2f} GB/s" | ||||
|             ) | ||||
|             print(f"{name}: {avg_time_ms:.3f} ms, {tflops:.2f} TFLOPS, {gb_s:.2f} GB/s") | ||||
|  | ||||
|     # Calculate speedups | ||||
|     baseline = benchmark_results["implementations"]["DeepGEMM"]["time_ms"] | ||||
|     for name, data in benchmark_results["implementations"].items(): | ||||
|         if name != "DeepGEMM": | ||||
|             speedup = baseline / data["time_ms"] | ||||
|             benchmark_results["implementations"][name][ | ||||
|                 "speedup_vs_deepgemm"] = speedup | ||||
|             benchmark_results["implementations"][name]["speedup_vs_deepgemm"] = speedup | ||||
|             if verbose: | ||||
|                 print(f"DeepGEMM is {1/speedup:.2f}x " | ||||
|                       f"{'faster' if 1/speedup > 1 else 'slower'} than {name}") | ||||
|                 print( | ||||
|                     f"DeepGEMM is {1 / speedup:.2f}x " | ||||
|                     f"{'faster' if 1 / speedup > 1 else 'slower'} than {name}" | ||||
|                 ) | ||||
|  | ||||
|     vllm_triton_time = benchmark_results["implementations"]["vLLM Triton"][ | ||||
|         "time_ms"] | ||||
|     vllm_cutlass_time = benchmark_results["implementations"]["vLLM CUTLASS"][ | ||||
|         "time_ms"] | ||||
|     vllm_triton_time = benchmark_results["implementations"]["vLLM Triton"]["time_ms"] | ||||
|     vllm_cutlass_time = benchmark_results["implementations"]["vLLM CUTLASS"]["time_ms"] | ||||
|     cutlass_vs_triton = vllm_triton_time / vllm_cutlass_time | ||||
|     benchmark_results["implementations"]["vLLM CUTLASS"][ | ||||
|         "speedup_vs_triton"] = cutlass_vs_triton | ||||
|     benchmark_results["implementations"]["vLLM CUTLASS"]["speedup_vs_triton"] = ( | ||||
|         cutlass_vs_triton | ||||
|     ) | ||||
|     if verbose: | ||||
|         print( | ||||
|             f"vLLM CUTLASS is {cutlass_vs_triton:.2f}x " | ||||
| @ -183,8 +184,7 @@ def benchmark_shape(m: int, | ||||
|  | ||||
| def format_table_row(values, widths): | ||||
|     """Format a row with specified column widths.""" | ||||
|     return "| " + " | ".join(f"{val:{w}}" | ||||
|                              for val, w in zip(values, widths)) + " |" | ||||
|     return "| " + " | ".join(f"{val:{w}}" for val, w in zip(values, widths)) + " |" | ||||
|  | ||||
|  | ||||
| def print_table(headers, rows, title=None): | ||||
| @ -292,38 +292,50 @@ def run_benchmarks(verbose: bool = False): | ||||
|     for result in all_results: | ||||
|         shape = result["shape"] | ||||
|         impl_data = result["implementations"]["DeepGEMM"] | ||||
|         deepgemm_rows.append([ | ||||
|             shape["m"], shape["n"], shape["k"], f"{impl_data['time_us']:.1f}", | ||||
|             f"{impl_data['tflops']:.1f}", f"{impl_data['gb_s']:.1f}" | ||||
|         ]) | ||||
|         deepgemm_rows.append( | ||||
|             [ | ||||
|                 shape["m"], | ||||
|                 shape["n"], | ||||
|                 shape["k"], | ||||
|                 f"{impl_data['time_us']:.1f}", | ||||
|                 f"{impl_data['tflops']:.1f}", | ||||
|                 f"{impl_data['gb_s']:.1f}", | ||||
|             ] | ||||
|         ) | ||||
|  | ||||
|     print_table(deepgemm_headers, | ||||
|                 deepgemm_rows, | ||||
|                 title="DeepGEMM Implementation:") | ||||
|     print_table(deepgemm_headers, deepgemm_rows, title="DeepGEMM Implementation:") | ||||
|  | ||||
|     # Print vLLM Triton table | ||||
|     triton_headers = [ | ||||
|         "m", "n", "k", "Time (μs)", "TFLOPS", "GB/s", "vs DeepGEMM" | ||||
|     ] | ||||
|     triton_headers = ["m", "n", "k", "Time (μs)", "TFLOPS", "GB/s", "vs DeepGEMM"] | ||||
|     triton_rows = [] | ||||
|     for result in all_results: | ||||
|         shape = result["shape"] | ||||
|         impl_data = result["implementations"]["vLLM Triton"] | ||||
|         speedup = impl_data.get("speedup_vs_deepgemm", 1.0) | ||||
|         triton_rows.append([ | ||||
|             shape["m"], shape["n"], shape["k"], f"{impl_data['time_us']:.1f}", | ||||
|             f"{impl_data['tflops']:.1f}", f"{impl_data['gb_s']:.1f}", | ||||
|             format_speedup(speedup) | ||||
|         ]) | ||||
|         triton_rows.append( | ||||
|             [ | ||||
|                 shape["m"], | ||||
|                 shape["n"], | ||||
|                 shape["k"], | ||||
|                 f"{impl_data['time_us']:.1f}", | ||||
|                 f"{impl_data['tflops']:.1f}", | ||||
|                 f"{impl_data['gb_s']:.1f}", | ||||
|                 format_speedup(speedup), | ||||
|             ] | ||||
|         ) | ||||
|  | ||||
|     print_table(triton_headers, | ||||
|                 triton_rows, | ||||
|                 title="vLLM Triton Implementation:") | ||||
|     print_table(triton_headers, triton_rows, title="vLLM Triton Implementation:") | ||||
|  | ||||
|     # Print vLLM CUTLASS table | ||||
|     cutlass_headers = [ | ||||
|         "m", "n", "k", "Time (μs)", "TFLOPS", "GB/s", "vs DeepGEMM", | ||||
|         "vs Triton" | ||||
|         "m", | ||||
|         "n", | ||||
|         "k", | ||||
|         "Time (μs)", | ||||
|         "TFLOPS", | ||||
|         "GB/s", | ||||
|         "vs DeepGEMM", | ||||
|         "vs Triton", | ||||
|     ] | ||||
|     cutlass_rows = [] | ||||
|     for result in all_results: | ||||
| @ -331,28 +343,27 @@ def run_benchmarks(verbose: bool = False): | ||||
|         impl_data = result["implementations"]["vLLM CUTLASS"] | ||||
|         vs_deepgemm = impl_data.get("speedup_vs_deepgemm", 1.0) | ||||
|         vs_triton = impl_data.get("speedup_vs_triton", 1.0) | ||||
|         cutlass_rows.append([ | ||||
|             shape["m"], shape["n"], shape["k"], f"{impl_data['time_us']:.1f}", | ||||
|             f"{impl_data['tflops']:.1f}", f"{impl_data['gb_s']:.1f}", | ||||
|             format_speedup(vs_deepgemm), | ||||
|             format_speedup(vs_triton) | ||||
|         ]) | ||||
|         cutlass_rows.append( | ||||
|             [ | ||||
|                 shape["m"], | ||||
|                 shape["n"], | ||||
|                 shape["k"], | ||||
|                 f"{impl_data['time_us']:.1f}", | ||||
|                 f"{impl_data['tflops']:.1f}", | ||||
|                 f"{impl_data['gb_s']:.1f}", | ||||
|                 format_speedup(vs_deepgemm), | ||||
|                 format_speedup(vs_triton), | ||||
|             ] | ||||
|         ) | ||||
|  | ||||
|     print_table(cutlass_headers, | ||||
|                 cutlass_rows, | ||||
|                 title="vLLM CUTLASS Implementation:") | ||||
|     print_table(cutlass_headers, cutlass_rows, title="vLLM CUTLASS Implementation:") | ||||
|  | ||||
|     # Calculate and print averages | ||||
|     print("\n===== AVERAGE PERFORMANCE =====") | ||||
|  | ||||
|     implementations = ["DeepGEMM", "vLLM Triton", "vLLM CUTLASS"] | ||||
|     avg_metrics = { | ||||
|         impl: { | ||||
|             "tflops": 0, | ||||
|             "gb_s": 0, | ||||
|             "time_ms": 0 | ||||
|         } | ||||
|         for impl in implementations | ||||
|         impl: {"tflops": 0, "gb_s": 0, "time_ms": 0} for impl in implementations | ||||
|     } | ||||
|  | ||||
|     for result in all_results: | ||||
| @ -370,9 +381,9 @@ def run_benchmarks(verbose: bool = False): | ||||
|         avg_tflops = avg_metrics[impl]["tflops"] / num_shapes | ||||
|         avg_mem_bw = avg_metrics[impl]["gb_s"] / num_shapes | ||||
|         avg_time = avg_metrics[impl]["time_ms"] / num_shapes | ||||
|         avg_rows.append([ | ||||
|             impl, f"{avg_tflops:.2f}", f"{avg_mem_bw:.2f}", f"{avg_time:.2f}" | ||||
|         ]) | ||||
|         avg_rows.append( | ||||
|             [impl, f"{avg_tflops:.2f}", f"{avg_mem_bw:.2f}", f"{avg_time:.2f}"] | ||||
|         ) | ||||
|  | ||||
|     print_table(avg_headers, avg_rows) | ||||
|  | ||||
| @ -380,21 +391,19 @@ def run_benchmarks(verbose: bool = False): | ||||
|     avg_speedups = { | ||||
|         "DeepGEMM vs vLLM Triton": 0, | ||||
|         "DeepGEMM vs vLLM CUTLASS": 0, | ||||
|         "vLLM CUTLASS vs vLLM Triton": 0 | ||||
|         "vLLM CUTLASS vs vLLM Triton": 0, | ||||
|     } | ||||
|  | ||||
|     for result in all_results: | ||||
|         deepgemm_time = result["implementations"]["DeepGEMM"]["time_ms"] | ||||
|         vllm_triton_time = result["implementations"]["vLLM Triton"]["time_ms"] | ||||
|         vllm_cutlass_time = result["implementations"]["vLLM CUTLASS"][ | ||||
|             "time_ms"] | ||||
|         vllm_cutlass_time = result["implementations"]["vLLM CUTLASS"]["time_ms"] | ||||
|  | ||||
|         avg_speedups[ | ||||
|             "DeepGEMM vs vLLM Triton"] += vllm_triton_time / deepgemm_time | ||||
|         avg_speedups[ | ||||
|             "DeepGEMM vs vLLM CUTLASS"] += vllm_cutlass_time / deepgemm_time | ||||
|         avg_speedups[ | ||||
|             "vLLM CUTLASS vs vLLM Triton"] += vllm_triton_time / vllm_cutlass_time | ||||
|         avg_speedups["DeepGEMM vs vLLM Triton"] += vllm_triton_time / deepgemm_time | ||||
|         avg_speedups["DeepGEMM vs vLLM CUTLASS"] += vllm_cutlass_time / deepgemm_time | ||||
|         avg_speedups["vLLM CUTLASS vs vLLM Triton"] += ( | ||||
|             vllm_triton_time / vllm_cutlass_time | ||||
|         ) | ||||
|  | ||||
|     print("\n===== AVERAGE SPEEDUPS =====") | ||||
|     speedup_headers = ["Comparison", "Speedup"] | ||||
| @ -412,8 +421,7 @@ def run_benchmarks(verbose: bool = False): | ||||
|  | ||||
|     for result in all_results: | ||||
|         for impl in implementations: | ||||
|             avg_diff[impl] += result["implementations"][impl]["diff"][ | ||||
|                 "Reference"] | ||||
|             avg_diff[impl] += result["implementations"][impl]["diff"]["Reference"] | ||||
|  | ||||
|     diff_headers = ["Implementation", "Avg Diff vs Reference"] | ||||
|     diff_rows = [] | ||||
|  | ||||
| @ -2,8 +2,8 @@ | ||||
| # SPDX-FileCopyrightText: Copyright contributors to the vLLM project | ||||
|  | ||||
| import dataclasses | ||||
| from collections.abc import Iterable | ||||
| from typing import Any, Callable, Optional | ||||
| from collections.abc import Callable, Iterable | ||||
| from typing import Any | ||||
|  | ||||
| import torch | ||||
| import torch.utils.benchmark as TBenchmark | ||||
| @ -55,7 +55,7 @@ class Bench: | ||||
|  | ||||
|     def __init__( | ||||
|         self, | ||||
|         cuda_graph_params: Optional[CudaGraphBenchParams], | ||||
|         cuda_graph_params: CudaGraphBenchParams | None, | ||||
|         label: str, | ||||
|         sub_label: str, | ||||
|         description: str, | ||||
|  | ||||
| @ -2,7 +2,7 @@ | ||||
| # SPDX-FileCopyrightText: Copyright contributors to the vLLM project | ||||
| from abc import ABC, abstractmethod | ||||
| from statistics import mean | ||||
| from typing import Any, NamedTuple, Optional, Union | ||||
| from typing import Any, NamedTuple | ||||
|  | ||||
| import numpy as np  # type: ignore | ||||
| import pandas as pd  # type: ignore | ||||
| @ -35,8 +35,8 @@ class Distribution(ABC): | ||||
| class UniformDistribution(Distribution): | ||||
|     def __init__( | ||||
|         self, | ||||
|         min_val: Union[int, float], | ||||
|         max_val: Union[int, float], | ||||
|         min_val: int | float, | ||||
|         max_val: int | float, | ||||
|         is_integer: bool = True, | ||||
|     ) -> None: | ||||
|         self.min_val = min_val | ||||
| @ -56,7 +56,7 @@ class UniformDistribution(Distribution): | ||||
|  | ||||
|  | ||||
| class ConstantDistribution(Distribution): | ||||
|     def __init__(self, value: Union[int, float]) -> None: | ||||
|     def __init__(self, value: int | float) -> None: | ||||
|         self.value = value | ||||
|         self.max_val = value | ||||
|  | ||||
| @ -68,7 +68,7 @@ class ConstantDistribution(Distribution): | ||||
|  | ||||
|  | ||||
| class ZipfDistribution(Distribution): | ||||
|     def __init__(self, alpha: float, max_val: Optional[int] = None) -> None: | ||||
|     def __init__(self, alpha: float, max_val: int | None = None) -> None: | ||||
|         self.alpha = alpha | ||||
|         self.max_val = max_val | ||||
|  | ||||
| @ -83,7 +83,7 @@ class ZipfDistribution(Distribution): | ||||
|  | ||||
|  | ||||
| class PoissonDistribution(Distribution): | ||||
|     def __init__(self, alpha: float, max_val: Optional[int] = None) -> None: | ||||
|     def __init__(self, alpha: float, max_val: int | None = None) -> None: | ||||
|         self.alpha = alpha | ||||
|         self.max_val = max_val | ||||
|  | ||||
| @ -100,11 +100,11 @@ class PoissonDistribution(Distribution): | ||||
| class LognormalDistribution(Distribution): | ||||
|     def __init__( | ||||
|         self, | ||||
|         mean: Optional[float] = None, | ||||
|         sigma: Optional[float] = None, | ||||
|         average: Optional[int] = None, | ||||
|         median_ratio: Optional[float] = None, | ||||
|         max_val: Optional[int] = None, | ||||
|         mean: float | None = None, | ||||
|         sigma: float | None = None, | ||||
|         average: int | None = None, | ||||
|         median_ratio: float | None = None, | ||||
|         max_val: int | None = None, | ||||
|     ) -> None: | ||||
|         self.average = average | ||||
|         self.median_ratio = median_ratio | ||||
|  | ||||
| @ -13,7 +13,7 @@ from datetime import datetime | ||||
| from enum import Enum | ||||
| from http import HTTPStatus | ||||
| from statistics import mean | ||||
| from typing import NamedTuple, Optional, Union | ||||
| from typing import NamedTuple | ||||
|  | ||||
| import aiohttp  # type: ignore | ||||
| import numpy as np  # type: ignore | ||||
| @ -46,9 +46,9 @@ class ConversationSampling(str, Enum): | ||||
|  | ||||
| class ClientArgs(NamedTuple): | ||||
|     seed: int | ||||
|     max_num_requests: Optional[int] | ||||
|     max_num_requests: int | None | ||||
|     skip_first_turn: bool | ||||
|     max_turns: Optional[int] | ||||
|     max_turns: int | None | ||||
|     max_active_conversations: int | ||||
|     verbose: bool | ||||
|     print_content: bool | ||||
| @ -109,9 +109,9 @@ class RequestStats(NamedTuple): | ||||
|  | ||||
| class MetricStats: | ||||
|     def __init__(self) -> None: | ||||
|         self.min: Optional[float] = None | ||||
|         self.max: Optional[float] = None | ||||
|         self.avg: Optional[float] = None | ||||
|         self.min: float | None = None | ||||
|         self.max: float | None = None | ||||
|         self.avg: float | None = None | ||||
|         self.sum = 0.0 | ||||
|         self.count = 0 | ||||
|  | ||||
| @ -143,7 +143,7 @@ class MovingAverage: | ||||
|         self.index = 0 | ||||
|         self.sum = 0.0 | ||||
|         self.count = 0 | ||||
|         self.avg: Optional[float] = None | ||||
|         self.avg: float | None = None | ||||
|  | ||||
|     def update(self, new_value: float) -> None: | ||||
|         if self.count < self.window_size: | ||||
| @ -169,7 +169,7 @@ class MovingAverage: | ||||
| class DebugStats: | ||||
|     def __init__(self, logger: logging.Logger, window_size: int) -> None: | ||||
|         self.logger = logger | ||||
|         self.metrics: dict[str, Union[MovingAverage, MetricStats]] = { | ||||
|         self.metrics: dict[str, MovingAverage | MetricStats] = { | ||||
|             "moving_avg_ttft_ms": MovingAverage(window_size), | ||||
|             "moving_avg_tpot_ms": MovingAverage(window_size), | ||||
|             "ttft_ms": MetricStats(), | ||||
| @ -198,14 +198,6 @@ class DebugStats: | ||||
|         self.logger.info("-" * 50) | ||||
|  | ||||
|  | ||||
| # Must support Python 3.8, we can't use str.removeprefix(prefix) | ||||
| # introduced in Python 3.9 | ||||
| def remove_prefix(text: str, prefix: str) -> str: | ||||
|     if text.startswith(prefix): | ||||
|         return text[len(prefix) :] | ||||
|     return text | ||||
|  | ||||
|  | ||||
| def nanosec_to_millisec(value: float) -> float: | ||||
|     return value / 1000000.0 | ||||
|  | ||||
| @ -220,8 +212,8 @@ async def send_request( | ||||
|     chat_url: str, | ||||
|     model: str, | ||||
|     stream: bool = True, | ||||
|     min_tokens: Optional[int] = None, | ||||
|     max_tokens: Optional[int] = None, | ||||
|     min_tokens: int | None = None, | ||||
|     max_tokens: int | None = None, | ||||
| ) -> ServerResponse: | ||||
|     payload = { | ||||
|         "model": model, | ||||
| @ -250,9 +242,9 @@ async def send_request( | ||||
|     timeout = aiohttp.ClientTimeout(total=timeout_sec) | ||||
|  | ||||
|     valid_response = True | ||||
|     ttft: Optional[float] = None | ||||
|     ttft: float | None = None | ||||
|     chunk_delay: list[int] = [] | ||||
|     latency: Optional[float] = None | ||||
|     latency: float | None = None | ||||
|     first_chunk = "" | ||||
|     generated_text = "" | ||||
|  | ||||
| @ -269,7 +261,7 @@ async def send_request( | ||||
|                 if not chunk_bytes: | ||||
|                     continue | ||||
|  | ||||
|                 chunk = remove_prefix(chunk_bytes.decode("utf-8"), "data: ") | ||||
|                 chunk = chunk_bytes.decode("utf-8").removeprefix("data: ") | ||||
|                 if chunk == "[DONE]": | ||||
|                     # End of stream | ||||
|                     latency = time.perf_counter_ns() - start_time | ||||
| @ -364,7 +356,7 @@ async def send_turn( | ||||
|     req_args: RequestArgs, | ||||
|     verbose: bool, | ||||
|     verify_output: bool, | ||||
| ) -> Optional[RequestStats]: | ||||
| ) -> RequestStats | None: | ||||
|     assert messages_to_use > 0 | ||||
|     assert messages_to_use <= len(conversation_messages) | ||||
|  | ||||
| @ -644,7 +636,7 @@ async def client_main( | ||||
|  | ||||
|             if args.verbose: | ||||
|                 curr_time_sec: float = time.perf_counter() | ||||
|                 time_since_last_turn: Union[str, float] = "N/A" | ||||
|                 time_since_last_turn: str | float = "N/A" | ||||
|                 if conv_id in time_of_last_turn: | ||||
|                     time_since_last_turn = round( | ||||
|                         curr_time_sec - time_of_last_turn[conv_id], 3 | ||||
| @ -769,7 +761,7 @@ def get_client_config( | ||||
|             "Number of conversations must be equal or larger than the number of clients" | ||||
|         ) | ||||
|  | ||||
|     max_req_per_client: Optional[int] = None | ||||
|     max_req_per_client: int | None = None | ||||
|     if args.max_num_requests is not None: | ||||
|         # Max number of requests per client | ||||
|         req_per_client = args.max_num_requests // args.num_clients | ||||
| @ -936,13 +928,13 @@ async def main_mp( | ||||
|                     f"{num_clients_finished} out of {bench_args.num_clients} clients finished, collected {len(client_metrics)} measurements, runtime {runtime_sec:.3f} sec{Color.RESET}"  # noqa: E501 | ||||
|                 ) | ||||
|  | ||||
|                 rps: Union[str, float] = round(len(client_metrics) / runtime_sec, 3) | ||||
|                 rps: str | float = round(len(client_metrics) / runtime_sec, 3) | ||||
|                 if len(client_metrics) < (5 * bench_args.num_clients): | ||||
|                     # Do not estimate the RPS if the number of samples is very low | ||||
|                     # (threshold can be tuned if needed) | ||||
|                     rps = "N/A" | ||||
|  | ||||
|                 runtime_left_sec: Union[str, float] = round( | ||||
|                 runtime_left_sec: str | float = round( | ||||
|                     (runtime_sec / finished_convs) * (total_convs - finished_convs), 3 | ||||
|                 ) | ||||
|                 if percent < 0.05: | ||||
| @ -1032,7 +1024,7 @@ def process_statistics( | ||||
|     warmup_percentages: list[float], | ||||
|     test_params: dict, | ||||
|     verbose: bool, | ||||
|     gen_conv_args: Optional[GenConvArgs] = None, | ||||
|     gen_conv_args: GenConvArgs | None = None, | ||||
|     excel_output: bool = False, | ||||
| ) -> None: | ||||
|     if len(client_metrics) == 0: | ||||
| @ -1259,7 +1251,7 @@ async def main() -> None: | ||||
|         default=None, | ||||
|         help="The model name used in the API. " | ||||
|         "If not specified, the model name will be the " | ||||
|         "same as the ``--model`` argument. ", | ||||
|         "same as the `--model` argument. ", | ||||
|     ) | ||||
|  | ||||
|     parser.add_argument( | ||||
|  | ||||
| @ -13,7 +13,7 @@ import argparse | ||||
| import json | ||||
| import random | ||||
| from statistics import mean | ||||
| from typing import Any, Optional | ||||
| from typing import Any | ||||
|  | ||||
| import pandas as pd  # type: ignore | ||||
| import tqdm  # type: ignore | ||||
| @ -25,7 +25,7 @@ def has_non_english_chars(text: str) -> bool: | ||||
|  | ||||
|  | ||||
| def content_is_valid( | ||||
|     content: str, min_content_len: Optional[int], max_content_len: Optional[int] | ||||
|     content: str, min_content_len: int | None, max_content_len: int | None | ||||
| ) -> bool: | ||||
|     if min_content_len and len(content) < min_content_len: | ||||
|         return False | ||||
| @ -37,7 +37,7 @@ def content_is_valid( | ||||
|  | ||||
|  | ||||
| def print_stats( | ||||
|     conversations: "list[dict[Any, Any]]", tokenizer: Optional[AutoTokenizer] = None | ||||
|     conversations: "list[dict[Any, Any]]", tokenizer: AutoTokenizer | None = None | ||||
| ) -> None: | ||||
|     # Collect statistics | ||||
|     stats = [] | ||||
| @ -109,12 +109,12 @@ def convert_sharegpt_to_openai( | ||||
|     seed: int, | ||||
|     input_file: str, | ||||
|     output_file: str, | ||||
|     max_items: Optional[int], | ||||
|     min_content_len: Optional[int] = None, | ||||
|     max_content_len: Optional[int] = None, | ||||
|     min_turns: Optional[int] = None, | ||||
|     max_turns: Optional[int] = None, | ||||
|     model: Optional[str] = None, | ||||
|     max_items: int | None, | ||||
|     min_content_len: int | None = None, | ||||
|     max_content_len: int | None = None, | ||||
|     min_turns: int | None = None, | ||||
|     max_turns: int | None = None, | ||||
|     model: str | None = None, | ||||
| ) -> None: | ||||
|     if min_turns and max_turns: | ||||
|         assert min_turns <= max_turns | ||||
|  | ||||
| @ -1,49 +0,0 @@ | ||||
| # This local pyproject file is part of the migration from yapf to ruff format. | ||||
| # It uses the same core rules as the main pyproject.toml file, but with the | ||||
| # following differences: | ||||
| # - ruff line length is overridden to 88 | ||||
| # - deprecated typing ignores (UP006, UP035) have been removed | ||||
|  | ||||
| [tool.ruff] | ||||
| line-length = 88 | ||||
|  | ||||
| [tool.ruff.lint.per-file-ignores] | ||||
| "vllm/third_party/**" = ["ALL"] | ||||
| "vllm/version.py" = ["F401"] | ||||
| "vllm/_version.py" = ["ALL"] | ||||
|  | ||||
| [tool.ruff.lint] | ||||
| select = [ | ||||
|     # pycodestyle | ||||
|     "E", | ||||
|     # Pyflakes | ||||
|     "F", | ||||
|     # pyupgrade | ||||
|     "UP", | ||||
|     # flake8-bugbear | ||||
|     "B", | ||||
|     # flake8-simplify | ||||
|     "SIM", | ||||
|     # isort | ||||
|     "I", | ||||
|     # flake8-logging-format | ||||
|     "G", | ||||
| ] | ||||
| ignore = [ | ||||
|     # star imports | ||||
|     "F405", "F403", | ||||
|     # lambda expression assignment | ||||
|     "E731", | ||||
|     # Loop control variable not used within loop body | ||||
|     "B007", | ||||
|     # f-string format | ||||
|     "UP032", | ||||
|     # Can remove once 3.10+ is the minimum Python version | ||||
|     "UP007", | ||||
| ] | ||||
|  | ||||
| [tool.ruff.lint.isort] | ||||
| known-first-party = ["vllm"] | ||||
|  | ||||
| [tool.ruff.format] | ||||
| docstring-code-format = true | ||||
| @ -198,13 +198,24 @@ else() | ||||
| endif() | ||||
|  | ||||
| if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON_FOUND) OR POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND) | ||||
|     FetchContent_Declare( | ||||
|         oneDNN | ||||
|         GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git | ||||
|         GIT_TAG v3.9 | ||||
|         GIT_PROGRESS TRUE | ||||
|         GIT_SHALLOW TRUE | ||||
|     ) | ||||
|     set(FETCHCONTENT_SOURCE_DIR_ONEDNN "$ENV{FETCHCONTENT_SOURCE_DIR_ONEDNN}" CACHE PATH "Path to a local oneDNN source directory.") | ||||
|  | ||||
|     if(FETCHCONTENT_SOURCE_DIR_ONEDNN) | ||||
|         message(STATUS "Using oneDNN from specified source directory: ${FETCHCONTENT_SOURCE_DIR_ONEDNN}") | ||||
|         FetchContent_Declare( | ||||
|             oneDNN | ||||
|             SOURCE_DIR ${FETCHCONTENT_SOURCE_DIR_ONEDNN} | ||||
|         ) | ||||
|     else() | ||||
|         message(STATUS "Downloading oneDNN from GitHub") | ||||
|         FetchContent_Declare( | ||||
|             oneDNN | ||||
|             GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git | ||||
|             GIT_TAG v3.9 | ||||
|             GIT_PROGRESS TRUE | ||||
|             GIT_SHALLOW TRUE | ||||
|         ) | ||||
|     endif() | ||||
|  | ||||
|     if(USE_ACL) | ||||
|         find_library(ARM_COMPUTE_LIBRARY NAMES arm_compute PATHS $ENV{ACL_ROOT_DIR}/build/) | ||||
| @ -213,6 +224,7 @@ if ((AVX512_FOUND AND NOT AVX512_DISABLED) OR (ASIMD_FOUND AND NOT APPLE_SILICON | ||||
|         endif() | ||||
|         set(ONEDNN_AARCH64_USE_ACL "ON") | ||||
|         set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wl,-rpath,$ENV{ACL_ROOT_DIR}/build/") | ||||
|         add_compile_definitions(VLLM_USE_ACL) | ||||
|     endif() | ||||
|  | ||||
|     set(ONEDNN_LIBRARY_TYPE "STATIC") | ||||
| @ -308,4 +320,4 @@ define_gpu_extension_target( | ||||
|     WITH_SOABI | ||||
| ) | ||||
|  | ||||
| message(STATUS "Enabling C extension.") | ||||
| message(STATUS "Enabling C extension.") | ||||
| @ -18,8 +18,8 @@ if(FLASH_MLA_SRC_DIR) | ||||
| else() | ||||
|   FetchContent_Declare( | ||||
|         flashmla | ||||
|         GIT_REPOSITORY https://github.com/vllm-project/FlashMLA.git | ||||
|         GIT_TAG a757314c04eedd166e329e846c820eb1bdd702de | ||||
|         GIT_REPOSITORY https://github.com/vllm-project/FlashMLA | ||||
|         GIT_TAG 5f65b85703c7ed75fda01e06495077caad207c3f | ||||
|         GIT_PROGRESS TRUE | ||||
|         CONFIGURE_COMMAND "" | ||||
|         BUILD_COMMAND "" | ||||
| @ -33,23 +33,64 @@ message(STATUS "FlashMLA is available at ${flashmla_SOURCE_DIR}") | ||||
| # The FlashMLA kernels only work on hopper and require CUDA 12.3 or later. | ||||
| # Only build FlashMLA kernels if we are building for something compatible with  | ||||
| # sm90a | ||||
| cuda_archs_loose_intersection(FLASH_MLA_ARCHS "9.0a" "${CUDA_ARCHS}") | ||||
| if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.3 AND FLASH_MLA_ARCHS) | ||||
|  | ||||
| set(SUPPORT_ARCHS) | ||||
| if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.3) | ||||
|     list(APPEND SUPPORT_ARCHS 9.0a) | ||||
| endif() | ||||
| if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8) | ||||
|     list(APPEND SUPPORT_ARCHS 10.0a) | ||||
| endif() | ||||
|  | ||||
|  | ||||
| cuda_archs_loose_intersection(FLASH_MLA_ARCHS "${SUPPORT_ARCHS}" "${CUDA_ARCHS}") | ||||
| if(FLASH_MLA_ARCHS) | ||||
|     set(VLLM_FLASHMLA_GPU_FLAGS ${VLLM_GPU_FLAGS}) | ||||
|     list(APPEND VLLM_FLASHMLA_GPU_FLAGS "--expt-relaxed-constexpr" "--expt-extended-lambda" "--use_fast_math") | ||||
|  | ||||
|     set(FlashMLA_SOURCES | ||||
|         ${flashmla_SOURCE_DIR}/csrc/flash_api.cpp | ||||
|         ${flashmla_SOURCE_DIR}/csrc/kernels/get_mla_metadata.cu | ||||
|         ${flashmla_SOURCE_DIR}/csrc/kernels/mla_combine.cu | ||||
|         ${flashmla_SOURCE_DIR}/csrc/kernels/splitkv_mla.cu | ||||
|         ${flashmla_SOURCE_DIR}/csrc/kernels_fp8/flash_fwd_mla_fp8_sm90.cu) | ||||
|         ${flashmla_SOURCE_DIR}/csrc/torch_api.cpp | ||||
|         ${flashmla_SOURCE_DIR}/csrc/pybind.cpp | ||||
|         ${flashmla_SOURCE_DIR}/csrc/smxx/get_mla_metadata.cu | ||||
|         ${flashmla_SOURCE_DIR}/csrc/smxx/mla_combine.cu | ||||
|         ${flashmla_SOURCE_DIR}/csrc/sm90/decode/dense/splitkv_mla.cu | ||||
|         ${flashmla_SOURCE_DIR}/csrc/sm90/decode/sparse_fp8/splitkv_mla.cu | ||||
|         ${flashmla_SOURCE_DIR}/csrc/sm90/prefill/sparse/fwd.cu | ||||
|         ${flashmla_SOURCE_DIR}/csrc/sm100/decode/sparse_fp8/splitkv_mla.cu | ||||
|         ${flashmla_SOURCE_DIR}/csrc/sm100/prefill/dense/fmha_cutlass_fwd_sm100.cu | ||||
|         ${flashmla_SOURCE_DIR}/csrc/sm100/prefill/dense/fmha_cutlass_bwd_sm100.cu | ||||
|         ${flashmla_SOURCE_DIR}/csrc/sm100/prefill/sparse/fwd.cu | ||||
|     ) | ||||
|  | ||||
|     set(FlashMLA_Extension_SOURCES | ||||
|         ${flashmla_SOURCE_DIR}/csrc/extension/torch_api.cpp | ||||
|         ${flashmla_SOURCE_DIR}/csrc/extension/sm90/dense_fp8/pybind.cpp | ||||
|         ${flashmla_SOURCE_DIR}/csrc/extension/sm90/dense_fp8/flash_fwd_mla_fp8_sm90.cu | ||||
|     ) | ||||
|  | ||||
|     set(FlashMLA_INCLUDES | ||||
|         ${flashmla_SOURCE_DIR}/csrc | ||||
|         ${flashmla_SOURCE_DIR}/csrc/sm90 | ||||
|         ${flashmla_SOURCE_DIR}/csrc/cutlass/include | ||||
|         ${flashmla_SOURCE_DIR}/csrc) | ||||
|         ${flashmla_SOURCE_DIR}/csrc/cutlass/tools/util/include | ||||
|     ) | ||||
|  | ||||
|     set(FlashMLA_Extension_INCLUDES | ||||
|         ${flashmla_SOURCE_DIR}/csrc | ||||
|         ${flashmla_SOURCE_DIR}/csrc/sm90 | ||||
|         ${flashmla_SOURCE_DIR}/csrc/extension/sm90/dense_fp8/ | ||||
|         ${flashmla_SOURCE_DIR}/csrc/cutlass/include | ||||
|         ${flashmla_SOURCE_DIR}/csrc/cutlass/tools/util/include | ||||
|     ) | ||||
|  | ||||
|     set_gencode_flags_for_srcs( | ||||
|         SRCS "${FlashMLA_SOURCES}" | ||||
|         CUDA_ARCHS "${FLASH_MLA_ARCHS}") | ||||
|  | ||||
|     set_gencode_flags_for_srcs( | ||||
|         SRCS "${FlashMLA_Extension_SOURCES}" | ||||
|         CUDA_ARCHS "${FLASH_MLA_ARCHS}") | ||||
|  | ||||
|     define_gpu_extension_target( | ||||
|         _flashmla_C | ||||
|         DESTINATION vllm | ||||
| @ -60,8 +101,32 @@ if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.3 AND FLASH_MLA_ARCHS) | ||||
|         INCLUDE_DIRECTORIES ${FlashMLA_INCLUDES} | ||||
|         USE_SABI 3 | ||||
|         WITH_SOABI) | ||||
|  | ||||
|     # Keep Stable ABI for the module, but *not* for CUDA/C++ files. | ||||
|     # This prevents Py_LIMITED_API from affecting nvcc and C++ compiles. | ||||
|     target_compile_options(_flashmla_C PRIVATE | ||||
|         $<$<COMPILE_LANGUAGE:CUDA>:-UPy_LIMITED_API> | ||||
|         $<$<COMPILE_LANGUAGE:CXX>:-UPy_LIMITED_API>) | ||||
|  | ||||
|     define_gpu_extension_target( | ||||
|         _flashmla_extension_C | ||||
|         DESTINATION vllm | ||||
|         LANGUAGE ${VLLM_GPU_LANG} | ||||
|         SOURCES ${FlashMLA_Extension_SOURCES} | ||||
|         COMPILE_FLAGS ${VLLM_FLASHMLA_GPU_FLAGS} | ||||
|         ARCHITECTURES ${VLLM_GPU_ARCHES} | ||||
|         INCLUDE_DIRECTORIES ${FlashMLA_Extension_INCLUDES} | ||||
|         USE_SABI 3 | ||||
|         WITH_SOABI) | ||||
|  | ||||
|     # Keep Stable ABI for the module, but *not* for CUDA/C++ files. | ||||
|     # This prevents Py_LIMITED_API from affecting nvcc and C++ compiles. | ||||
|     target_compile_options(_flashmla_extension_C PRIVATE | ||||
|         $<$<COMPILE_LANGUAGE:CUDA>:-UPy_LIMITED_API> | ||||
|         $<$<COMPILE_LANGUAGE:CXX>:-UPy_LIMITED_API>) | ||||
| else() | ||||
|     # Create an empty target for setup.py when not targeting sm90a systems | ||||
|     # Create empty targets for setup.py when not targeting sm90a systems | ||||
|     add_custom_target(_flashmla_C) | ||||
|     add_custom_target(_flashmla_extension_C) | ||||
| endif() | ||||
|  | ||||
|  | ||||
							
								
								
									
										97
									
								
								cmake/external_projects/qutlass.cmake
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										97
									
								
								cmake/external_projects/qutlass.cmake
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,97 @@ | ||||
| include(FetchContent) | ||||
|  | ||||
| set(CUTLASS_INCLUDE_DIR "${CUTLASS_INCLUDE_DIR}" CACHE PATH "Path to CUTLASS include/ directory") | ||||
|  | ||||
| if(DEFINED ENV{QUTLASS_SRC_DIR}) | ||||
|   set(QUTLASS_SRC_DIR $ENV{QUTLASS_SRC_DIR}) | ||||
| endif() | ||||
|  | ||||
| if(QUTLASS_SRC_DIR) | ||||
|   FetchContent_Declare( | ||||
|     qutlass | ||||
|     SOURCE_DIR ${QUTLASS_SRC_DIR} | ||||
|     CONFIGURE_COMMAND "" | ||||
|     BUILD_COMMAND "" | ||||
|   ) | ||||
| else() | ||||
|   FetchContent_Declare( | ||||
|     qutlass | ||||
|     GIT_REPOSITORY https://github.com/IST-DASLab/qutlass.git | ||||
|     GIT_TAG 830d2c4537c7396e14a02a46fbddd18b5d107c65 | ||||
|     GIT_PROGRESS TRUE | ||||
|     CONFIGURE_COMMAND "" | ||||
|     BUILD_COMMAND "" | ||||
|   ) | ||||
| endif() | ||||
|  | ||||
| FetchContent_Populate(qutlass) | ||||
|  | ||||
| if(NOT qutlass_SOURCE_DIR) | ||||
|   message(FATAL_ERROR "[QUTLASS] source directory could not be resolved.") | ||||
| endif() | ||||
| message(STATUS "[QUTLASS] QuTLASS is available at ${qutlass_SOURCE_DIR}") | ||||
|  | ||||
| cuda_archs_loose_intersection(QUTLASS_ARCHS "12.0a;10.0a" "${CUDA_ARCHS}") | ||||
| if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND QUTLASS_ARCHS) | ||||
|  | ||||
|   if(QUTLASS_ARCHS MATCHES "10\\.0a") | ||||
|     set(QUTLASS_TARGET_CC 100) | ||||
|   elseif(QUTLASS_ARCHS MATCHES "12\\.0a") | ||||
|     set(QUTLASS_TARGET_CC 120) | ||||
|   else() | ||||
|     message(FATAL_ERROR "[QUTLASS] internal error parsing CUDA_ARCHS='${QUTLASS_ARCHS}'.") | ||||
|   endif() | ||||
|  | ||||
|   set(QUTLASS_SOURCES | ||||
|     ${qutlass_SOURCE_DIR}/qutlass/csrc/bindings.cpp | ||||
|     ${qutlass_SOURCE_DIR}/qutlass/csrc/gemm.cu | ||||
|     ${qutlass_SOURCE_DIR}/qutlass/csrc/gemm_ada.cu | ||||
|     ${qutlass_SOURCE_DIR}/qutlass/csrc/fused_quantize_mx.cu | ||||
|     ${qutlass_SOURCE_DIR}/qutlass/csrc/fused_quantize_nv.cu | ||||
|     ${qutlass_SOURCE_DIR}/qutlass/csrc/fused_quantize_mx_sm100.cu | ||||
|     ${qutlass_SOURCE_DIR}/qutlass/csrc/fused_quantize_nv_sm100.cu | ||||
|   ) | ||||
|  | ||||
|   set(QUTLASS_INCLUDES | ||||
|     ${qutlass_SOURCE_DIR} | ||||
|     ${qutlass_SOURCE_DIR}/qutlass | ||||
|     ${qutlass_SOURCE_DIR}/qutlass/csrc/include | ||||
|     ${qutlass_SOURCE_DIR}/qutlass/csrc/include/cutlass_extensions | ||||
|   ) | ||||
|  | ||||
|   if(CUTLASS_INCLUDE_DIR AND EXISTS "${CUTLASS_INCLUDE_DIR}/cutlass/cutlass.h") | ||||
|     list(APPEND QUTLASS_INCLUDES "${CUTLASS_INCLUDE_DIR}") | ||||
|   elseif(EXISTS "${qutlass_SOURCE_DIR}/qutlass/third_party/cutlass/include/cutlass/cutlass.h") | ||||
|     list(APPEND QUTLASS_INCLUDES "${qutlass_SOURCE_DIR}/qutlass/third_party/cutlass/include") | ||||
|     message(STATUS "[QUTLASS] Using QuTLASS vendored CUTLASS headers (no vLLM CUTLASS detected).") | ||||
|   else() | ||||
|     message(FATAL_ERROR "[QUTLASS] CUTLASS headers not found. " | ||||
|                         "Set -DCUTLASS_INCLUDE_DIR=/path/to/cutlass/include") | ||||
|   endif() | ||||
|  | ||||
|   set_gencode_flags_for_srcs( | ||||
|     SRCS "${QUTLASS_SOURCES}" | ||||
|     CUDA_ARCHS "${QUTLASS_ARCHS}" | ||||
|   ) | ||||
|  | ||||
|   target_sources(_C PRIVATE ${QUTLASS_SOURCES}) | ||||
|   target_include_directories(_C PRIVATE ${QUTLASS_INCLUDES}) | ||||
|   target_compile_definitions(_C PRIVATE | ||||
|     QUTLASS_DISABLE_PYBIND=1 | ||||
|     TARGET_CUDA_ARCH=${QUTLASS_TARGET_CC} | ||||
|   ) | ||||
|  | ||||
|   set_property(SOURCE ${QUTLASS_SOURCES} APPEND PROPERTY COMPILE_OPTIONS | ||||
|     $<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr --use_fast_math -O3> | ||||
|   ) | ||||
|  | ||||
| else() | ||||
|   if("${CMAKE_CUDA_COMPILER_VERSION}" VERSION_LESS "12.8") | ||||
|     message(STATUS | ||||
|       "[QUTLASS] Skipping build: CUDA 12.8 or newer is required (found ${CMAKE_CUDA_COMPILER_VERSION}).") | ||||
|   else() | ||||
|     message(STATUS | ||||
|       "[QUTLASS] Skipping build: no supported arch (12.0a / 10.0a) found in " | ||||
|       "CUDA_ARCHS='${CUDA_ARCHS}'.") | ||||
|   endif() | ||||
| endif() | ||||
| @ -38,7 +38,7 @@ else() | ||||
|   FetchContent_Declare( | ||||
|           vllm-flash-attn | ||||
|           GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git | ||||
|           GIT_TAG ee4d25bd84e0cbc7e0b9b9685085fd5db2dcb62a | ||||
|           GIT_TAG a893712401d70362fbb299cd9c4b3476e8e9ed54 | ||||
|           GIT_PROGRESS TRUE | ||||
|           # Don't share the vllm-flash-attn build between build types | ||||
|           BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn | ||||
|  | ||||
| @ -16,7 +16,7 @@ import shutil | ||||
|  | ||||
| from torch.utils.hipify.hipify_python import hipify | ||||
|  | ||||
| if __name__ == '__main__': | ||||
| if __name__ == "__main__": | ||||
|     parser = argparse.ArgumentParser() | ||||
|  | ||||
|     # Project directory where all the source + include files live. | ||||
| @ -34,15 +34,14 @@ if __name__ == '__main__': | ||||
|     ) | ||||
|  | ||||
|     # Source files to convert. | ||||
|     parser.add_argument("sources", | ||||
|                         help="Source files to hipify.", | ||||
|                         nargs="*", | ||||
|                         default=[]) | ||||
|     parser.add_argument( | ||||
|         "sources", help="Source files to hipify.", nargs="*", default=[] | ||||
|     ) | ||||
|  | ||||
|     args = parser.parse_args() | ||||
|  | ||||
|     # Limit include scope to project_dir only | ||||
|     includes = [os.path.join(args.project_dir, '*')] | ||||
|     includes = [os.path.join(args.project_dir, "*")] | ||||
|  | ||||
|     # Get absolute path for all source files. | ||||
|     extra_files = [os.path.abspath(s) for s in args.sources] | ||||
| @ -51,25 +50,31 @@ if __name__ == '__main__': | ||||
|     # The directory might already exist to hold object files so we ignore that. | ||||
|     shutil.copytree(args.project_dir, args.output_dir, dirs_exist_ok=True) | ||||
|  | ||||
|     hipify_result = hipify(project_directory=args.project_dir, | ||||
|                            output_directory=args.output_dir, | ||||
|                            header_include_dirs=[], | ||||
|                            includes=includes, | ||||
|                            extra_files=extra_files, | ||||
|                            show_detailed=True, | ||||
|                            is_pytorch_extension=True, | ||||
|                            hipify_extra_files_only=True) | ||||
|     hipify_result = hipify( | ||||
|         project_directory=args.project_dir, | ||||
|         output_directory=args.output_dir, | ||||
|         header_include_dirs=[], | ||||
|         includes=includes, | ||||
|         extra_files=extra_files, | ||||
|         show_detailed=True, | ||||
|         is_pytorch_extension=True, | ||||
|         hipify_extra_files_only=True, | ||||
|     ) | ||||
|  | ||||
|     hipified_sources = [] | ||||
|     for source in args.sources: | ||||
|         s_abs = os.path.abspath(source) | ||||
|         hipified_s_abs = (hipify_result[s_abs].hipified_path if | ||||
|                           (s_abs in hipify_result | ||||
|                            and hipify_result[s_abs].hipified_path is not None) | ||||
|                           else s_abs) | ||||
|         hipified_s_abs = ( | ||||
|             hipify_result[s_abs].hipified_path | ||||
|             if ( | ||||
|                 s_abs in hipify_result | ||||
|                 and hipify_result[s_abs].hipified_path is not None | ||||
|             ) | ||||
|             else s_abs | ||||
|         ) | ||||
|         hipified_sources.append(hipified_s_abs) | ||||
|  | ||||
|     assert (len(hipified_sources) == len(args.sources)) | ||||
|     assert len(hipified_sources) == len(args.sources) | ||||
|  | ||||
|     # Print hipified source files. | ||||
|     print("\n".join(hipified_sources)) | ||||
|  | ||||
| @ -310,13 +310,13 @@ function(cuda_archs_loose_intersection OUT_CUDA_ARCHS SRC_CUDA_ARCHS TGT_CUDA_AR | ||||
|   list(REMOVE_DUPLICATES _PTX_ARCHS) | ||||
|   list(REMOVE_DUPLICATES _SRC_CUDA_ARCHS) | ||||
|  | ||||
|   # if x.0a is in SRC_CUDA_ARCHS and x.0 is in CUDA_ARCHS then we should | ||||
|   # remove x.0a from SRC_CUDA_ARCHS and add x.0a to _CUDA_ARCHS | ||||
|   # If x.0a or x.0f is in SRC_CUDA_ARCHS and x.0 is in CUDA_ARCHS then we should | ||||
|   # remove x.0a or x.0f from SRC_CUDA_ARCHS and add x.0a or x.0f to _CUDA_ARCHS | ||||
|   set(_CUDA_ARCHS) | ||||
|   foreach(_arch ${_SRC_CUDA_ARCHS}) | ||||
|     if(_arch MATCHES "\\a$") | ||||
|     if(_arch MATCHES "[af]$") | ||||
|       list(REMOVE_ITEM _SRC_CUDA_ARCHS "${_arch}") | ||||
|       string(REPLACE "a" "" _base "${_arch}") | ||||
|       string(REGEX REPLACE "[af]$" "" _base "${_arch}") | ||||
|       if ("${_base}" IN_LIST TGT_CUDA_ARCHS) | ||||
|         list(REMOVE_ITEM _TGT_CUDA_ARCHS "${_base}") | ||||
|         list(APPEND _CUDA_ARCHS "${_arch}") | ||||
|  | ||||
							
								
								
									
										12
									
								
								codecov.yml
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										12
									
								
								codecov.yml
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,12 @@ | ||||
| codecov: | ||||
|   require_ci_to_pass: false | ||||
|  | ||||
| fixes: | ||||
|   # Map source code paths to repository root paths | ||||
|   # Wildcards match any Python version (python3.*) | ||||
|   - "/vllm-workspace/src/vllm/::vllm/" | ||||
|   - "/vllm-workspace/vllm/::vllm/" | ||||
|   - "/usr/local/lib/python3.*/dist-packages/vllm/::vllm/" | ||||
|   - "/usr/local/lib/python3.*/site-packages/vllm/::vllm/" | ||||
|   - "/usr/lib/python3.*/dist-packages/vllm/::vllm/" | ||||
|   - "/usr/lib/python3.*/site-packages/vllm/::vllm/" | ||||
| @ -28,10 +28,10 @@ | ||||
|  | ||||
| #ifdef USE_ROCM | ||||
|   #include <hip/hip_bf16.h> | ||||
|   #include "../quantization/fp8/amd/quant_utils.cuh" | ||||
|   #include "../quantization/w8a8/fp8/amd/quant_utils.cuh" | ||||
| typedef __hip_bfloat16 __nv_bfloat16; | ||||
| #else | ||||
|   #include "../quantization/fp8/nvidia/quant_utils.cuh" | ||||
|   #include "../quantization/w8a8/fp8/nvidia/quant_utils.cuh" | ||||
| #endif | ||||
|  | ||||
| #define MAX(a, b) ((a) > (b) ? (a) : (b)) | ||||
|  | ||||
| @ -125,32 +125,37 @@ public: | ||||
|   } | ||||
|  | ||||
|   static void set_split_kv (KernelArguments& args) { | ||||
|     // printf("set_split_kv start"); | ||||
|     if (args.split_kv >= 1) return; | ||||
|     auto [H, K, D, B] = args.problem_shape; | ||||
|     // std::cout << H << " " << K << " " << D << " " << B << "\n";       | ||||
|     int sm_count = args.hw_info.sm_count; | ||||
|     // printf("    sm_count = %d\n", sm_count); | ||||
|     int max_splits = ceil_div(K, 128); | ||||
|     max_splits = min(16, max_splits); | ||||
|     float seq_length_k = static_cast<float>(K) / 1024.0f; | ||||
|     int max_splits = 1; | ||||
|  | ||||
|     // TODO: This avoids a hang when the batch size larger than 1 and  | ||||
|     // there is more than 1 kv_splits.  | ||||
|     // Discuss with NVIDIA how this can be fixed. | ||||
|     if (B > 1) { | ||||
|       max_splits = min(1, max_splits); | ||||
|     if (B <= 4 && seq_length_k >= 16) { | ||||
|       max_splits = 16; | ||||
|     } | ||||
|      | ||||
|     // printf("    max_splits = %d\n", max_splits); | ||||
|     else if (B <= 8 && seq_length_k >= 4) { | ||||
|       max_splits = 8; | ||||
|     } | ||||
|     else if ((B <= 16 && seq_length_k >= 8) || | ||||
|              (B == 48 && seq_length_k >= 32)) { | ||||
|       max_splits = 4; | ||||
|     } | ||||
|     else if ((B <= 32 && seq_length_k >= 16) || | ||||
|              (B == 96 && seq_length_k >= 16)) { | ||||
|       max_splits = 2; | ||||
|     } | ||||
|     else { | ||||
|       max_splits = 1; | ||||
|     } | ||||
|  | ||||
|     // Wave-aware scheduling: ensure integer number of waves in K dimension | ||||
|     int sms_per_batch = max(1, sm_count / B); | ||||
|     // printf("    sms_per_batch = %d\n", sms_per_batch); | ||||
|     int split_heur = min(max_splits, sms_per_batch); | ||||
|     int waves = ceil_div(B * split_heur, sm_count); | ||||
|     int k_waves = ceil_div(max_splits, split_heur); | ||||
|     int split_wave_aware = ceil_div(max_splits, k_waves); | ||||
|     args.split_kv = split_wave_aware; | ||||
|     // printf("    args.split_kv = %d\n", args.split_kv); | ||||
|  | ||||
|   } | ||||
|  | ||||
|   /// Determines whether the GEMM can execute the given problem. | ||||
|  | ||||
| @ -580,22 +580,22 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized { | ||||
|       for (; tile_scheduler.is_valid(); ++tile_scheduler) { | ||||
|         auto blk_coord = tile_scheduler.get_block_coord(); | ||||
|         auto problem_shape = params.problem_shape; | ||||
| 	auto local_split_kv = params.split_kv; | ||||
|         auto local_split_kv = params.split_kv; | ||||
|         if (params.mainloop.ptr_seq != nullptr) { | ||||
|           get<1>(problem_shape) = params.mainloop.ptr_seq[get<2>(blk_coord)]; | ||||
| 	  if (params.ptr_split_kv != nullptr) { | ||||
|           if (params.ptr_split_kv != nullptr) { | ||||
|             local_split_kv = params.ptr_split_kv[get<2>(blk_coord)]; | ||||
|           } | ||||
|         } | ||||
| 	if (local_split_kv <= get<3>(blk_coord)) | ||||
| 	  continue; | ||||
|         if (local_split_kv <= get<3>(blk_coord)) | ||||
|           continue; | ||||
|         load_page_table( | ||||
|           blk_coord, | ||||
|           problem_shape, | ||||
|           params.mainloop, | ||||
|           shared_storage.tensors, | ||||
|           pipeline_page_table, pipeline_pt_producer_state, | ||||
| 	  local_split_kv | ||||
|           local_split_kv | ||||
|         ); | ||||
|       } | ||||
|     } | ||||
| @ -604,15 +604,15 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized { | ||||
|         CUTLASS_PRAGMA_NO_UNROLL | ||||
|         for (; tile_scheduler.is_valid(); ++tile_scheduler) { | ||||
|           auto blk_coord = tile_scheduler.get_block_coord(); | ||||
| 	  auto problem_shape = params.problem_shape; | ||||
| 	  auto local_split_kv = params.split_kv; | ||||
|           auto problem_shape = params.problem_shape; | ||||
|           auto local_split_kv = params.split_kv; | ||||
|           if (params.mainloop.ptr_seq != nullptr) { | ||||
|             get<1>(problem_shape) = params.mainloop.ptr_seq[get<2>(blk_coord)]; | ||||
| 	    if (params.ptr_split_kv != nullptr) { | ||||
|             if (params.ptr_split_kv != nullptr) { | ||||
|               local_split_kv = params.ptr_split_kv[get<2>(blk_coord)]; | ||||
|             } | ||||
|           } | ||||
| 	  if (local_split_kv <= get<3>(blk_coord)) | ||||
|           if (local_split_kv <= get<3>(blk_coord)) | ||||
|             continue; | ||||
|           load_cpasync( | ||||
|             blk_coord, | ||||
| @ -621,7 +621,7 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized { | ||||
|             params.mainloop_params, | ||||
|             shared_storage.tensors, | ||||
|             pipeline_load_qk, pipeline_load_qk_producer_state, | ||||
| 	    local_split_kv, | ||||
|             local_split_kv, | ||||
|             /* must be shared pipe */ | ||||
|             pipeline_page_table, pipeline_pt_consumer_state | ||||
|           ); | ||||
| @ -633,15 +633,15 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized { | ||||
|           CUTLASS_PRAGMA_NO_UNROLL | ||||
|           for (; tile_scheduler.is_valid(); ++tile_scheduler) { | ||||
|             auto blk_coord = tile_scheduler.get_block_coord(); | ||||
| 	    auto problem_shape = params.problem_shape; | ||||
| 	    auto local_split_kv = params.split_kv; | ||||
|             auto problem_shape = params.problem_shape; | ||||
|             auto local_split_kv = params.split_kv; | ||||
|             if (params.mainloop.ptr_seq != nullptr) { | ||||
|               get<1>(problem_shape) = params.mainloop.ptr_seq[get<2>(blk_coord)]; | ||||
| 	      if (params.ptr_split_kv != nullptr) { | ||||
| 	        local_split_kv = params.ptr_split_kv[get<2>(blk_coord)]; | ||||
| 	      } | ||||
|               if (params.ptr_split_kv != nullptr) { | ||||
|                 local_split_kv = params.ptr_split_kv[get<2>(blk_coord)]; | ||||
|               } | ||||
|             } | ||||
| 	    if (local_split_kv <= get<3>(blk_coord)) | ||||
|             if (local_split_kv <= get<3>(blk_coord)) | ||||
|               continue; | ||||
|             load_tma</* paged= */ true>( | ||||
|               blk_coord, | ||||
| @ -651,7 +651,7 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized { | ||||
|               shared_storage.tensors, | ||||
|               pipeline_load_qk, pipeline_load_qk_producer_state, | ||||
|               pipeline_load_qk, pipeline_load_qk_producer_state, | ||||
| 	      local_split_kv | ||||
|               local_split_kv | ||||
|             ); | ||||
|             cutlass::arch::NamedBarrier((kNumComputeWarps + kNumLoadWarps) * NumThreadsPerWarp, kNamedBarrierEpilogue).arrive_and_wait(); | ||||
|           } | ||||
| @ -660,15 +660,15 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized { | ||||
|           CUTLASS_PRAGMA_NO_UNROLL | ||||
|           for (; tile_scheduler.is_valid(); ++tile_scheduler) { | ||||
|             auto blk_coord = tile_scheduler.get_block_coord(); | ||||
| 	    auto problem_shape = params.problem_shape; | ||||
| 	    auto local_split_kv = params.split_kv; | ||||
|             auto problem_shape = params.problem_shape; | ||||
|             auto local_split_kv = params.split_kv; | ||||
|             if (params.mainloop.ptr_seq != nullptr) { | ||||
|               get<1>(problem_shape) = params.mainloop.ptr_seq[get<2>(blk_coord)]; | ||||
| 	      if (params.ptr_split_kv != nullptr) { | ||||
|               if (params.ptr_split_kv != nullptr) { | ||||
|                 local_split_kv = params.ptr_split_kv[get<2>(blk_coord)]; | ||||
| 	      } | ||||
|               } | ||||
|             } | ||||
| 	    if (local_split_kv <= get<3>(blk_coord)) | ||||
|             if (local_split_kv <= get<3>(blk_coord)) | ||||
|               continue; | ||||
|             load_tma<false>( | ||||
|               blk_coord, | ||||
| @ -678,7 +678,7 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized { | ||||
|               shared_storage.tensors, | ||||
|               pipeline_load_qk, pipeline_load_qk_producer_state, | ||||
|               pipeline_load_qk, pipeline_load_qk_producer_state, | ||||
| 	      local_split_kv | ||||
|               local_split_kv | ||||
|             ); | ||||
|             cutlass::arch::NamedBarrier((kNumComputeWarps + kNumLoadWarps) * NumThreadsPerWarp, kNamedBarrierEpilogue).arrive_and_wait(); | ||||
|           } | ||||
| @ -694,14 +694,14 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized { | ||||
|         for (; tile_scheduler.is_valid(); ++tile_scheduler) { | ||||
|           auto blk_coord = tile_scheduler.get_block_coord(); | ||||
|           auto problem_shape = params.problem_shape; | ||||
| 	  auto local_split_kv = params.split_kv; | ||||
|           auto local_split_kv = params.split_kv; | ||||
|           if (params.mainloop.ptr_seq != nullptr) { | ||||
|             get<1>(problem_shape) = params.mainloop.ptr_seq[get<2>(blk_coord)]; | ||||
|             if (params.ptr_split_kv != nullptr) { | ||||
|                 local_split_kv = params.ptr_split_kv[get<2>(blk_coord)]; | ||||
|             } | ||||
|           } | ||||
| 	  if (local_split_kv <= get<3>(blk_coord)) | ||||
|           if (local_split_kv <= get<3>(blk_coord)) | ||||
|             continue; | ||||
|           mma(blk_coord, | ||||
|             problem_shape, | ||||
| @ -711,7 +711,7 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized { | ||||
|             pipeline_mma_s, pipeline_mma_s_producer_state, | ||||
|             pipeline_p_mma, pipeline_p_mma_consumer_state, | ||||
|             pipeline_mma_o, pipeline_mma_o_producer_state, | ||||
| 	    local_split_kv | ||||
|             local_split_kv | ||||
|           ); | ||||
|         } | ||||
|       } | ||||
| @ -726,15 +726,15 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized { | ||||
|       for (; tile_scheduler.is_valid(); ++tile_scheduler) { | ||||
|         auto blk_coord = tile_scheduler.get_block_coord(); | ||||
|         auto problem_shape = params.problem_shape; | ||||
| 	auto split_kv = params.split_kv; | ||||
| 	auto local_split_kv = split_kv; | ||||
|         auto split_kv = params.split_kv; | ||||
|         auto local_split_kv = split_kv; | ||||
|         if (params.mainloop.ptr_seq != nullptr) { | ||||
|           get<1>(problem_shape) = params.mainloop.ptr_seq[get<2>(blk_coord)]; | ||||
| 	  if (params.ptr_split_kv != nullptr) { | ||||
|           if (params.ptr_split_kv != nullptr) { | ||||
|             local_split_kv = params.ptr_split_kv[get<2>(blk_coord)]; | ||||
|           } | ||||
|         } | ||||
| 	if (local_split_kv <= get<3>(blk_coord)) | ||||
|         if (local_split_kv <= get<3>(blk_coord)) | ||||
|           continue; | ||||
|         compute( | ||||
|           blk_coord, | ||||
| @ -745,7 +745,7 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized { | ||||
|           pipeline_mma_s, pipeline_mma_s_consumer_state, | ||||
|           pipeline_p_mma, pipeline_p_mma_producer_state, | ||||
|           pipeline_mma_o, pipeline_mma_o_consumer_state, | ||||
| 	  local_split_kv | ||||
|           local_split_kv | ||||
|         ); | ||||
|       } | ||||
|  | ||||
| @ -1900,7 +1900,7 @@ struct Sm100FmhaMlaKernelTmaWarpspecialized { | ||||
|       cutlass::arch::NamedBarrier( | ||||
|           (kNumComputeWarps + kNumLoadWarps) * NumThreadsPerWarp, | ||||
|           kNamedBarrierEpilogue | ||||
|       ).arrive(); | ||||
|       ).arrive_and_wait(); | ||||
|  | ||||
|       return; | ||||
|     } | ||||
|  | ||||
							
								
								
									
										16
									
								
								csrc/cache.h
									
									
									
									
									
								
							
							
						
						
									
										16
									
								
								csrc/cache.h
									
									
									
									
									
								
							| @ -56,3 +56,19 @@ void cp_gather_cache( | ||||
|     torch::Tensor const& block_table,  // [BATCH, BLOCK_INDICES] | ||||
|     torch::Tensor const& cu_seq_lens,  // [BATCH+1] | ||||
|     int64_t batch_size, std::optional<torch::Tensor> seq_starts = std::nullopt); | ||||
|  | ||||
| // Indexer K quantization and cache function | ||||
| void indexer_k_quant_and_cache( | ||||
|     torch::Tensor& k,             // [num_tokens, head_dim] | ||||
|     torch::Tensor& kv_cache,      // [num_blocks, block_size, cache_stride] | ||||
|     torch::Tensor& slot_mapping,  // [num_tokens] | ||||
|     int64_t quant_block_size,     // quantization block size | ||||
|     const std::string& scale_fmt); | ||||
|  | ||||
| // Extract function to gather quantized K cache | ||||
| void cp_gather_indexer_k_quant_cache( | ||||
|     const torch::Tensor& kv_cache,  // [num_blocks, block_size, cache_stride] | ||||
|     torch::Tensor& dst_k,           // [num_tokens, head_dim] | ||||
|     torch::Tensor& dst_scale,  // [num_tokens, head_dim / quant_block_size * 4] | ||||
|     const torch::Tensor& block_table,   // [batch_size, num_blocks] | ||||
|     const torch::Tensor& cu_seq_lens);  // [batch_size + 1] | ||||
| @ -9,15 +9,14 @@ | ||||
| #include "quantization/vectorization_utils.cuh" | ||||
|  | ||||
| #ifdef USE_ROCM | ||||
|   #include "quantization/fp8/amd/quant_utils.cuh" | ||||
|   #include "quantization/w8a8/fp8/amd/quant_utils.cuh" | ||||
| #else | ||||
|   #include "quantization/fp8/nvidia/quant_utils.cuh" | ||||
|   #include "quantization/w8a8/fp8/nvidia/quant_utils.cuh" | ||||
| #endif | ||||
|  | ||||
| #include <algorithm> | ||||
| #include <cassert> | ||||
| #include <map> | ||||
| #include <vector> | ||||
| #include <cfloat> | ||||
|  | ||||
| #ifdef USE_ROCM | ||||
|   #include <hip/hip_bf16.h> | ||||
| @ -209,6 +208,20 @@ void copy_blocks_mla(std::vector<torch::Tensor> const& kv_caches, | ||||
|  | ||||
| namespace vllm { | ||||
|  | ||||
| // Used to copy/convert one element | ||||
| template <typename OutT, typename InT, Fp8KVCacheDataType kv_dt> | ||||
| struct CopyWithScaleOp { | ||||
|   float scale; | ||||
|  | ||||
|   __device__ __forceinline__ void operator()(OutT& dst, const InT src) const { | ||||
|     if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) { | ||||
|       dst = static_cast<OutT>(src); | ||||
|     } else { | ||||
|       dst = fp8::scaled_convert<OutT, InT, kv_dt>(src, scale); | ||||
|     } | ||||
|   } | ||||
| }; | ||||
|  | ||||
| template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt> | ||||
| __global__ void reshape_and_cache_kernel( | ||||
|     const scalar_t* __restrict__ key,    // [num_tokens, num_heads, head_size] | ||||
| @ -224,59 +237,51 @@ __global__ void reshape_and_cache_kernel( | ||||
|   const int64_t token_idx = blockIdx.x; | ||||
|   const int64_t slot_idx = slot_mapping[token_idx]; | ||||
|   if (slot_idx < 0) { | ||||
|     // Padding token that should be ignored. | ||||
|     return; | ||||
|   } | ||||
|  | ||||
|   const int64_t block_idx = slot_idx / block_size; | ||||
|   const int64_t block_offset = slot_idx % block_size; | ||||
|   const int h_block_count = head_size / x;  // head_size//x | ||||
|  | ||||
|   const int n = num_heads * head_size; | ||||
|   for (int i = threadIdx.x; i < n; i += blockDim.x) { | ||||
|     const int64_t src_key_idx = token_idx * key_stride + i; | ||||
|     const int64_t src_value_idx = token_idx * value_stride + i; | ||||
|   const int h_block_idx = threadIdx.x; | ||||
|   if (h_block_idx >= num_heads * h_block_count) { | ||||
|     return; | ||||
|   } | ||||
|  | ||||
|     const int head_idx = i / head_size; | ||||
|     const int head_offset = i % head_size; | ||||
|     const int x_idx = head_offset / x; | ||||
|     const int x_offset = head_offset % x; | ||||
|   const int head_idx = h_block_idx / h_block_count; | ||||
|   const int h_block = h_block_idx % h_block_count; | ||||
|  | ||||
|     const int64_t tgt_key_idx = | ||||
|         block_idx * num_heads * (head_size / x) * block_size * x + | ||||
|         head_idx * (head_size / x) * block_size * x + x_idx * block_size * x + | ||||
|         block_offset * x + x_offset; | ||||
|     const int64_t tgt_value_idx = | ||||
|         block_idx * num_heads * head_size * block_size + | ||||
|         head_idx * head_size * block_size + head_offset * block_size + | ||||
|         block_offset; | ||||
|     scalar_t tgt_key = key[src_key_idx]; | ||||
|     scalar_t tgt_value = value[src_value_idx]; | ||||
|     if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) { | ||||
|       key_cache[tgt_key_idx] = tgt_key; | ||||
|       value_cache[tgt_value_idx] = tgt_value; | ||||
|     } else { | ||||
|       key_cache[tgt_key_idx] = | ||||
|           fp8::scaled_convert<cache_t, scalar_t, kv_dt>(tgt_key, *k_scale); | ||||
|       value_cache[tgt_value_idx] = | ||||
|           fp8::scaled_convert<cache_t, scalar_t, kv_dt>(tgt_value, *v_scale); | ||||
|     } | ||||
|   const scalar_t* __restrict__ key_src = | ||||
|       key + token_idx * key_stride + head_idx * head_size + h_block * x; | ||||
|   const int64_t src_value_start = | ||||
|       token_idx * value_stride + head_idx * head_size + h_block * x; | ||||
|  | ||||
|   cache_t* __restrict__ key_dst = | ||||
|       key_cache + block_idx * num_heads * h_block_count * block_size * x + | ||||
|       head_idx * h_block_count * block_size * x + h_block * block_size * x + | ||||
|       block_offset * x; | ||||
|   const int64_t tgt_value_start = | ||||
|       block_idx * num_heads * h_block_count * x * block_size + | ||||
|       head_idx * h_block_count * x * block_size + h_block * x * block_size + | ||||
|       block_offset; | ||||
|  | ||||
|   constexpr int VEC_SIZE = (sizeof(scalar_t) == 2) ? 8 : 4; | ||||
|   float k_scale_val = (kv_dt == Fp8KVCacheDataType::kAuto) ? 0.f : *k_scale; | ||||
|   CopyWithScaleOp<cache_t, scalar_t, kv_dt> k_op{k_scale_val}; | ||||
|   float v_scale_val = (kv_dt == Fp8KVCacheDataType::kAuto) ? 0.f : *v_scale; | ||||
|   CopyWithScaleOp<cache_t, scalar_t, kv_dt> v_op{v_scale_val}; | ||||
|  | ||||
|   vectorize_with_alignment<VEC_SIZE>(key_src, key_dst, x, 0, 1, k_op); | ||||
|  | ||||
|   const scalar_t* __restrict__ value_src = value + src_value_start; | ||||
|   cache_t* __restrict__ value_dst = value_cache + tgt_value_start; | ||||
| #pragma unroll | ||||
|   for (int i = 0; i < x; i++) { | ||||
|     v_op(value_dst[i * block_size], value_src[i]); | ||||
|   } | ||||
| } | ||||
|  | ||||
| // Used by vectorization_utils to copy/convert one element | ||||
| template <typename OutT, typename InT, Fp8KVCacheDataType kv_dt> | ||||
| struct CopyWithScaleOp { | ||||
|   float scale; | ||||
|  | ||||
|   __device__ __forceinline__ void operator()(OutT& dst, const InT src) const { | ||||
|     if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) { | ||||
|       dst = static_cast<OutT>(src); | ||||
|     } else { | ||||
|       dst = fp8::scaled_convert<OutT, InT, kv_dt>(src, scale); | ||||
|     } | ||||
|   } | ||||
| }; | ||||
|  | ||||
| template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt> | ||||
| __global__ void reshape_and_cache_flash_kernel( | ||||
|     const scalar_t* __restrict__ key,    // [num_tokens, num_heads, head_size] | ||||
| @ -396,6 +401,241 @@ __global__ void concat_and_cache_mla_kernel( | ||||
|   copy(k_pe, kv_cache, k_pe_stride, block_stride, pe_dim, kv_lora_rank); | ||||
| } | ||||
|  | ||||
| // Writes one token's MLA entry into the paged KV cache in a packed layout: | ||||
| // 8-bit quantized NoPE values, then one float scale per 128-element tile, | ||||
| // then the RoPE values copied through unquantized as 16-bit words. | ||||
| // | ||||
| // One thread block handles one token (blockIdx.x is the token index). | ||||
| // Threads 0-63 each quantize 8 consecutive kv_c elements; threads >= 64 each | ||||
| // copy 2 consecutive k_pe elements. Tokens whose slot_mapping entry is | ||||
| // negative are skipped entirely. | ||||
| // | ||||
| // Layout of one entry as indexed below (assumes cache_t is 1 byte and | ||||
| // scalar_t is 2 bytes -- TODO confirm against the instantiations): | ||||
| //   bytes [0, kv_lora_rank)              quantized NoPE values | ||||
| //   floats at index kv_lora_rank/4 + t   scale of tile t (t = 0..3) | ||||
| //   16-bit index kv_lora_rank/2 + 8 ...  RoPE values | ||||
| // | ||||
| // NOTE(review): `scale`, `pe_dim` and the template parameter `kv_dt` are not | ||||
| // referenced in the body; the conversion always uses kFp8E4M3 with the | ||||
| // per-tile scale computed here. | ||||
| template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt> | ||||
| __global__ void concat_and_cache_ds_mla_kernel( | ||||
|     const scalar_t* __restrict__ kv_c,  // [num_tokens, kv_lora_rank] | ||||
|     const scalar_t* __restrict__ k_pe,  // [num_tokens, pe_dim] | ||||
|     cache_t* __restrict__ kv_cache,  // [num_blocks, block_size, (kv_lora_rank | ||||
|                                      // + pe_dim)] | ||||
|     const int64_t* __restrict__ slot_mapping,  // [num_tokens] | ||||
|     const int block_stride,                    // | ||||
|     const int entry_stride,                    // | ||||
|     const int kv_c_stride,                     // | ||||
|     const int k_pe_stride,                     // | ||||
|     const int kv_lora_rank,                    // | ||||
|     const int pe_dim,                          // | ||||
|     const int block_size,                      // | ||||
|     const float* scale                         // | ||||
| ) { | ||||
|   const int64_t token_idx = blockIdx.x; | ||||
|   const int64_t slot_idx = slot_mapping[token_idx]; | ||||
|   // NOTE: slot_idx can be -1 if the token is padded | ||||
|   if (slot_idx < 0) { | ||||
|     return; | ||||
|   } | ||||
|   // Split the flat slot into (cache block, position within the block) and | ||||
|   // turn that into the offset of this token's entry, in cache_t elements. | ||||
|   const int64_t block_idx = slot_idx / block_size; | ||||
|   const int64_t block_offset = slot_idx % block_size; | ||||
|   const int64_t dst_idx_start = | ||||
|       block_idx * block_stride + block_offset * entry_stride; | ||||
|  | ||||
|   // For the NoPE part, each tile of 128 elements is handled by half of one warp | ||||
|   // (16 threads). There are 4 total tiles, so 2 warps (64 threads). | ||||
|   // Lanes 0 and 16 of each warp write the scale values for that warp's tiles. | ||||
|   // The RoPE part (last 64 elements) is handled by another 1 warp (32 threads). | ||||
|   // So in total, we use 3 warps (96 threads) per block. | ||||
|  | ||||
|   // Cast kv_cache to 16_bit for RoPE values | ||||
|   scalar_t* kv_cache_16bit = | ||||
|       reinterpret_cast<scalar_t*>(&kv_cache[dst_idx_start]); | ||||
|  | ||||
|   // The last warp handles the RoPE part | ||||
|   if (threadIdx.x >= 64) { | ||||
|     // Each thread handles two elements of RoPE | ||||
|     const int8_t pe_idx_start = (threadIdx.x - 64) * 2; | ||||
|     const int64_t src_idx = token_idx * k_pe_stride + pe_idx_start; | ||||
|     // Vectorized load of two 16-bit values, performed as one 32-bit load | ||||
|     const int32_t vals = *reinterpret_cast<const int32_t*>(&k_pe[src_idx]); | ||||
|     // RoPE values start after the packed 8-bit NoPE values and the | ||||
|     // 32-bit scales | ||||
|     const int64_t dst_idx = kv_lora_rank / 2 + 8 + pe_idx_start; | ||||
|     // Vectorized store of two 16-bit values, performed as one 32-bit store | ||||
|     *reinterpret_cast<int32_t*>(&kv_cache_16bit[dst_idx]) = vals; | ||||
|     return; | ||||
|   } | ||||
|  | ||||
|   // The first two warps handle the NoPE part | ||||
|   const int8_t warp_idx = threadIdx.x >> 5; | ||||
|   const int8_t lane_idx = threadIdx.x & 31; | ||||
|   // Two tiles per warp: lanes 0-15 own the even tile, lanes 16-31 the odd one | ||||
|   const int8_t tile_idx = warp_idx * 2 + (lane_idx >> 4); | ||||
|  | ||||
|   // Each thread handles 8 elements of NoPE | ||||
|   // Load the NoPE elements for this thread into registers | ||||
|   const int64_t src_idx_start = token_idx * kv_c_stride + (threadIdx.x * 8); | ||||
|   // Vectorized load of eight 16-bit values, performed as an int4 load | ||||
|   // (assumes the source address is 16-byte aligned -- TODO confirm) | ||||
|   const int4 vals_i4 = *reinterpret_cast<const int4*>(&kv_c[src_idx_start]); | ||||
|   const scalar_t* vals = reinterpret_cast<const scalar_t*>(&vals_i4); | ||||
|  | ||||
|   // Max absolute value of this thread's elements | ||||
|   float max_abs = fmaxf(fmaxf(fmaxf(fabsf(vals[0]), fabsf(vals[1])), | ||||
|                               fmaxf(fabsf(vals[2]), fabsf(vals[3]))), | ||||
|                         fmaxf(fmaxf(fabsf(vals[4]), fabsf(vals[5])), | ||||
|                               fmaxf(fabsf(vals[6]), fabsf(vals[7])))); | ||||
|  | ||||
|   // Warp-level reduction to find the max absolute value in each half-warp | ||||
|   // (XOR offsets 8, 4, 2, 1 with width 16 keep the exchange inside 16 lanes) | ||||
| #pragma unroll | ||||
|   for (int offset = 8; offset > 0; offset /= 2) { | ||||
|     max_abs = fmaxf(max_abs, VLLM_SHFL_XOR_SYNC_WIDTH(max_abs, offset, 16)); | ||||
|   } | ||||
|  | ||||
|   // Compute the scale for the tile | ||||
|   // (448 is presumably the largest finite FP8 E4M3 magnitude; the FLT_MIN | ||||
|   // floor keeps the scale non-zero when the whole tile is zero) | ||||
|   float tile_scale = max_abs / 448.f; | ||||
|   tile_scale = fmaxf(tile_scale, FLT_MIN); | ||||
|  | ||||
|   // The first lane of each half-warp writes the scale to kv_cache | ||||
|   if ((lane_idx == 0) || (lane_idx == 16)) { | ||||
|     float* kv_cache_32bit = reinterpret_cast<float*>(&kv_cache[dst_idx_start]); | ||||
|     // Index is in floats: kv_lora_rank / 4 floats == kv_lora_rank bytes | ||||
|     const uint64_t dst_idx = kv_lora_rank / 4 + tile_idx; | ||||
|     kv_cache_32bit[dst_idx] = tile_scale; | ||||
|   } | ||||
|  | ||||
|   // Now all threads in the block scale and write their elements | ||||
|   // NoPE data is packed one byte per element at the start of the entry | ||||
|   // (threads 0-63 write 8 cache_t elements each; with a 1-byte cache_t that | ||||
|   // is the first kv_lora_rank bytes, i.e. 512 bytes for kv_lora_rank == 512) | ||||
|   const int64_t dst_idx_base = dst_idx_start + (threadIdx.x * 8); | ||||
|  | ||||
|   uint8_t result[8]; | ||||
| #pragma unroll | ||||
|   for (int i = 0; i < 8; i++) { | ||||
|     result[i] = | ||||
|         fp8::scaled_convert<uint8_t, scalar_t, Fp8KVCacheDataType::kFp8E4M3>( | ||||
|             vals[i], tile_scale); | ||||
|   } | ||||
|  | ||||
|   // Store as aligned 64-bit writes | ||||
|   *reinterpret_cast<uint64_t*>(&kv_cache[dst_idx_base]) = | ||||
|       *reinterpret_cast<const uint64_t*>(result); | ||||
| } | ||||
|  | ||||
| // Quantizes one token's indexer key vector `k` and stores both the quantized | ||||
| // values and the per-group float scale in the paged cache. | ||||
| // | ||||
| // blockIdx.x selects the token; the flattened (blockIdx.y, threadIdx.y, | ||||
| // threadIdx.x) index times VEC_SIZE selects the first of the 4 elements this | ||||
| // thread handles. The absolute maximum is reduced over all 32 lanes of a | ||||
| // warp, so each warp produces one scale for 32 * VEC_SIZE = 128 elements | ||||
| // (presumably equal to quant_block_size -- TODO confirm with the caller). | ||||
| // | ||||
| // Within one cache block the quantized values come first | ||||
| // (cache_block_size * head_dim elements), followed by the scales. | ||||
| template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt> | ||||
| __global__ void indexer_k_quant_and_cache_kernel( | ||||
|     const scalar_t* __restrict__ k,  // [num_tokens, head_dim] | ||||
|     cache_t* __restrict__ kv_cache,  // [num_blocks, block_size, cache_stride] | ||||
|     const int64_t* __restrict__ slot_mapping,  // [num_tokens] | ||||
|     const int head_dim,                        // dimension of each head | ||||
|     const int quant_block_size,                // quantization block size | ||||
|     const int cache_block_size,                // cache block size | ||||
|     const int cache_stride,  // stride for each token in kv_cache | ||||
|     const bool use_ue8m0     // use ue8m0 scale format | ||||
| ) { | ||||
|   constexpr int VEC_SIZE = 4; | ||||
|   const int64_t token_idx = blockIdx.x; | ||||
|   const int64_t head_dim_idx = (blockIdx.y * blockDim.y * blockDim.x + | ||||
|                                 threadIdx.y * blockDim.x + threadIdx.x) * | ||||
|                                VEC_SIZE; | ||||
|   const int64_t slot_idx = slot_mapping[token_idx]; | ||||
|   const int64_t block_idx = slot_idx / cache_block_size; | ||||
|   const int64_t block_offset = slot_idx % cache_block_size; | ||||
|  | ||||
|   // NOTE: slot_idx can be -1 if the token is padded | ||||
|   // NOTE(review): this return happens before the full-mask warp shuffles | ||||
|   // below, so it looks safe only when all 32 lanes of a warp exit together | ||||
|   // -- confirm head_dim is always a multiple of 32 * VEC_SIZE. | ||||
|   if (slot_idx < 0 || (head_dim_idx >= head_dim)) { | ||||
|     return; | ||||
|   } | ||||
|  | ||||
|   // One 8-byte (float2) load fetches VEC_SIZE elements at once; this assumes | ||||
|   // scalar_t is 2 bytes wide -- TODO confirm. | ||||
|   float2 k_val = (reinterpret_cast<const float2*>( | ||||
|       k))[(token_idx * head_dim + head_dim_idx) / VEC_SIZE]; | ||||
|   scalar_t* k_val_ptr = reinterpret_cast<scalar_t*>(&k_val); | ||||
|   // Per-thread absolute maximum of the loaded elements | ||||
|   float amax = 0.0f; | ||||
|   for (int i = 0; i < VEC_SIZE; i++) { | ||||
|     amax = fmaxf(amax, fabsf(float(k_val_ptr[i]))); | ||||
|   } | ||||
| #ifndef USE_ROCM | ||||
|   __syncwarp(); | ||||
| #endif | ||||
|  | ||||
|   // Reduced amax | ||||
|   // (XOR masks 16, 8, 4, 2, 1 leave every lane holding the 32-lane maximum) | ||||
|   for (int mask = 16; mask > 0; mask /= 2) { | ||||
| #ifdef USE_ROCM | ||||
|     amax = fmaxf(amax, __shfl_xor_sync(uint64_t(-1), amax, mask)); | ||||
| #else | ||||
|     amax = fmaxf(amax, __shfl_xor_sync(unsigned(-1), amax, mask)); | ||||
| #endif | ||||
|   } | ||||
| #ifndef USE_ROCM | ||||
|   __syncwarp(); | ||||
| #endif | ||||
|   // Clamp amax from below so the scale is never zero, then optionally round | ||||
|   // the scale up to the next power of two for the ue8m0 format. | ||||
|   float scale = fmaxf(amax, 1e-4) / 448.0f; | ||||
|   if (use_ue8m0) { | ||||
|     scale = exp2f(ceilf(log2f(scale))); | ||||
|   } | ||||
|  | ||||
|   // Destination of this thread's quantized elements inside the cache block | ||||
|   const int64_t dst_offset = block_idx * cache_block_size * cache_stride + | ||||
|                              block_offset * head_dim + head_dim_idx; | ||||
|   for (int i = 0; i < VEC_SIZE; i++) { | ||||
|     kv_cache[dst_offset + i] = | ||||
|         fp8::scaled_convert<cache_t, scalar_t, kv_dt>(k_val_ptr[i], scale); | ||||
|   } | ||||
|   // One lane per warp row (threadIdx.x == 0) stores the shared scale | ||||
|   if (threadIdx.x == 0) { | ||||
|     // Offset in cache_t elements: skip the block's value region, then | ||||
|     // advance 4 per quant_block_size values; the / 4 below converts it to a | ||||
|     // float index (assumes cache_t is 1 byte -- TODO confirm). | ||||
|     const int64_t dst_scale_idx = | ||||
|         block_idx * cache_block_size * cache_stride + | ||||
|         cache_block_size * head_dim + | ||||
|         (block_offset * head_dim + head_dim_idx) * 4 / quant_block_size; | ||||
|     reinterpret_cast<float*>(kv_cache)[dst_scale_idx / 4] = scale; | ||||
|   } | ||||
| } | ||||
|  | ||||
| // Gathers quantized indexer keys and their scales out of the paged cache | ||||
| // into the dense per-token buffers dst_k and dst_scale. | ||||
| // | ||||
| // Each thread copies VEC_SIZE = 16 bytes of one token (one float4). | ||||
| // threadIdx.y picks the token inside the thread block and threadIdx.x the | ||||
| // 16-byte chunk of the head dimension. BLOCK_Y_SIZE sizes the shared array | ||||
| // that holds the batch (sequence) index found for each token row. | ||||
| template <int BLOCK_Y_SIZE> | ||||
| __global__ void cp_gather_indexer_k_quant_cache_kernel( | ||||
|     const char* __restrict__ kv_cache,  // [num_blocks, block_size, | ||||
|                                         // cache_stride] | ||||
|     char* __restrict__ dst_k,           // [num_tokens, head_dim] | ||||
|     char* __restrict__ dst_scale,  // [num_tokens, head_dim / quant_block_size * | ||||
|                                    // 4] | ||||
|     const int* __restrict__ block_table,  // [batch_size, num_blocks] | ||||
|     const int* __restrict__ cu_seq_lens,  // [batch_size + 1] | ||||
|     const int batch_size,                 // batch size | ||||
|     const int64_t token_stride,           // stride for each token in dst_k | ||||
|     const int64_t head_dim,               // dimension of each head | ||||
|     const int64_t block_stride,           // stride for each block in kv_cache | ||||
|     const int64_t cache_token_stride,     // stride for each token in kv_cache | ||||
|     const int64_t cache_block_size,  // num_tokens for each block in kv_cache | ||||
|     const int num_blocks,            // number of blocks | ||||
|     const int num_tokens,            // number of tokens | ||||
|     const int quant_block_size       // quantization block size | ||||
| ) { | ||||
|   constexpr int VEC_SIZE = sizeof(float4) / sizeof(char); | ||||
|   const int token_idx = blockIdx.x * blockDim.y + threadIdx.y; | ||||
|   const int head_idx = (blockIdx.y * blockDim.x + threadIdx.x) * VEC_SIZE; | ||||
|   // Find batch index within a block | ||||
|   // The threads of one row (same threadIdx.y) split the batch entries among | ||||
|   // themselves; the thread whose [seq_start, seq_end) range contains | ||||
|   // token_idx records that batch index for the whole row. | ||||
|   __shared__ int batch_idx[BLOCK_Y_SIZE]; | ||||
|   for (int iter = 0; iter < cuda_utils::ceil_div(batch_size, int(blockDim.x)); | ||||
|        iter++) { | ||||
|     int tid = iter * blockDim.x + threadIdx.x; | ||||
|     if (tid < batch_size) { | ||||
|       const int seq_start = cu_seq_lens[tid]; | ||||
|       const int seq_end = cu_seq_lens[tid + 1]; | ||||
|       if (token_idx >= seq_start && token_idx < seq_end) { | ||||
|         batch_idx[threadIdx.y] = tid; | ||||
|       } | ||||
|     } | ||||
|   } | ||||
|  | ||||
|   // NOTE(review): only a warp-level sync separates the shared-memory write | ||||
|   // above from the reads below, so this relies on every row (same | ||||
|   // threadIdx.y) living inside a single warp -- confirm against the launch | ||||
|   // configuration. No sync is issued at all when USE_ROCM is defined. | ||||
| #ifndef USE_ROCM | ||||
|   __syncwarp(); | ||||
| #endif | ||||
|  | ||||
|   if (head_idx >= head_dim || token_idx >= num_tokens) { | ||||
|     return; | ||||
|   } | ||||
|   // Position of the token inside its sequence, then the physical cache block | ||||
|   // that holds it according to the block table. | ||||
|   const int inbatch_seq_idx = token_idx - cu_seq_lens[batch_idx[threadIdx.y]]; | ||||
|   const int block_idx = block_table[batch_idx[threadIdx.y] * num_blocks + | ||||
|                                     inbatch_seq_idx / cache_block_size]; | ||||
|   const int64_t src_block_offset = block_idx * block_stride; | ||||
|   const int64_t cache_inblock_offset = | ||||
|       (inbatch_seq_idx % cache_block_size) * head_dim + head_idx; | ||||
|   const int64_t src_inblock_offset = src_block_offset + cache_inblock_offset; | ||||
|   const int64_t dst_inblock_offset = token_idx * token_stride + head_idx; | ||||
|  | ||||
|   // 16-byte copy of the quantized values (byte offsets divided by VEC_SIZE | ||||
|   // give float4 indices) | ||||
|   reinterpret_cast<float4*>(dst_k)[dst_inblock_offset / VEC_SIZE] = | ||||
|       reinterpret_cast<const float4*>(kv_cache)[src_inblock_offset / VEC_SIZE]; | ||||
|   ; | ||||
|   // The first thread of each row also copies one float scale. In the cache | ||||
|   // the scales start cache_block_size * head_dim bytes into the block, with | ||||
|   // 4 bytes per quant_block_size values. | ||||
|   if (threadIdx.x == 0) { | ||||
|     const int64_t src_scale_offset = | ||||
|         src_block_offset + cache_block_size * head_dim + | ||||
|         cache_inblock_offset * 4 / quant_block_size; | ||||
|     reinterpret_cast<float*>(dst_scale)[dst_inblock_offset / quant_block_size] = | ||||
|         reinterpret_cast<const float*>(kv_cache)[src_scale_offset / 4]; | ||||
|   } | ||||
| } | ||||
|  | ||||
| }  // namespace vllm | ||||
|  | ||||
| // KV_T is the data type of key and value tensors. | ||||
| @ -431,14 +671,15 @@ void reshape_and_cache( | ||||
|  | ||||
|   int key_stride = key.stride(0); | ||||
|   int value_stride = value.stride(0); | ||||
|   int head_div_x = head_size / x; | ||||
|  | ||||
|   dim3 grid(num_tokens); | ||||
|   dim3 block(std::min(num_heads * head_size, 512)); | ||||
|   dim3 block(std::min(num_heads * head_div_x, 512)); | ||||
|   const at::cuda::OptionalCUDAGuard device_guard(device_of(key)); | ||||
|   const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); | ||||
|  | ||||
|   DISPATCH_BY_KV_CACHE_DTYPE(key.dtype(), kv_cache_dtype, | ||||
|                              CALL_RESHAPE_AND_CACHE) | ||||
|                              CALL_RESHAPE_AND_CACHE); | ||||
| } | ||||
|  | ||||
| // KV_T is the data type of key and value tensors. | ||||
| @ -509,6 +750,18 @@ void reshape_and_cache_flash( | ||||
|           kv_c_stride, k_pe_stride, kv_lora_rank, pe_dim, block_size,   \ | ||||
|           reinterpret_cast<const float*>(scale.data_ptr())); | ||||
|  | ||||
| // Launches vllm::concat_and_cache_ds_mla_kernel for one type combination. | ||||
| // KV_T is the data type of key and value tensors. | ||||
| // CACHE_T is the stored data type of kv-cache. | ||||
| // KV_DTYPE is the Fp8KVCacheDataType template argument passed to the kernel. | ||||
| // The expansion site must have `grid`, `block`, `stream`, the tensors | ||||
| // (kv_c, k_pe, kv_cache, slot_mapping, scale) and the stride/size locals | ||||
| // named below in scope. | ||||
| #define CALL_CONCAT_AND_CACHE_DS_MLA(KV_T, CACHE_T, KV_DTYPE)           \ | ||||
|   vllm::concat_and_cache_ds_mla_kernel<KV_T, CACHE_T, KV_DTYPE>         \ | ||||
|       <<<grid, block, 0, stream>>>(                                     \ | ||||
|           reinterpret_cast<KV_T*>(kv_c.data_ptr()),                     \ | ||||
|           reinterpret_cast<KV_T*>(k_pe.data_ptr()),                     \ | ||||
|           reinterpret_cast<CACHE_T*>(kv_cache.data_ptr()),              \ | ||||
|           slot_mapping.data_ptr<int64_t>(), block_stride, entry_stride, \ | ||||
|           kv_c_stride, k_pe_stride, kv_lora_rank, pe_dim, block_size,   \ | ||||
|           reinterpret_cast<const float*>(scale.data_ptr())); | ||||
|  | ||||
| void concat_and_cache_mla( | ||||
|     torch::Tensor& kv_c,          // [num_tokens, kv_lora_rank] | ||||
|     torch::Tensor& k_pe,          // [num_tokens, pe_dim] | ||||
| @ -531,20 +784,43 @@ void concat_and_cache_mla( | ||||
|   int pe_dim = k_pe.size(1); | ||||
|   int block_size = kv_cache.size(1); | ||||
|  | ||||
|   TORCH_CHECK(kv_cache.size(2) == kv_lora_rank + pe_dim); | ||||
|   if (kv_cache_dtype == "fp8_ds_mla") { | ||||
|     TORCH_CHECK(kv_lora_rank == 512, "kv_lora_rank must be 512 for fp8_ds_mla"); | ||||
|     TORCH_CHECK(pe_dim == 64, "pe_dim must be 64 for fp8_ds_mla"); | ||||
|     TORCH_CHECK(kv_cache.size(2) == 656 / kv_cache.itemsize(), | ||||
|                 "kv_cache.size(2) must be 656 bytes for fp8_ds_mla"); | ||||
|     TORCH_CHECK(kv_c.itemsize() == 2, | ||||
|                 "kv_c.itemsize() must be 2 for fp8_ds_mla"); | ||||
|     TORCH_CHECK(k_pe.itemsize() == 2, | ||||
|                 "k_pe.itemsize() must be 2 for fp8_ds_mla"); | ||||
|   } else { | ||||
|     TORCH_CHECK(kv_cache.size(2) == kv_lora_rank + pe_dim); | ||||
|   } | ||||
|  | ||||
|   int kv_c_stride = kv_c.stride(0); | ||||
|   int k_pe_stride = k_pe.stride(0); | ||||
|   int block_stride = kv_cache.stride(0); | ||||
|   int entry_stride = kv_cache.stride(1); | ||||
|  | ||||
|   dim3 grid(num_tokens); | ||||
|   dim3 block(std::min(kv_lora_rank, 512)); | ||||
|   const at::cuda::OptionalCUDAGuard device_guard(device_of(kv_c)); | ||||
|   const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); | ||||
|  | ||||
|   DISPATCH_BY_KV_CACHE_DTYPE(kv_c.dtype(), kv_cache_dtype, | ||||
|                              CALL_CONCAT_AND_CACHE_MLA); | ||||
|   if (kv_cache_dtype == "fp8_ds_mla") { | ||||
|     dim3 grid(num_tokens); | ||||
|     // For the NoPE part, each tile of 128 elements is handled by half of one | ||||
|     // warp (16 threads). There are 4 total tiles, so 2 warps (64 threads). | ||||
|     // Lanes 0 and 16 of each warp write the scale values for that warp's tiles. | ||||
|     // The RoPE part (last 64 elements) is handled by another 1 warp (32 | ||||
|     // threads). So in total, we use 3 warps (96 threads) per block. | ||||
|     dim3 block(96); | ||||
|     DISPATCH_BY_KV_CACHE_DTYPE(kv_c.dtype(), kv_cache_dtype, | ||||
|                                CALL_CONCAT_AND_CACHE_DS_MLA); | ||||
|   } else { | ||||
|     dim3 grid(num_tokens); | ||||
|     dim3 block(std::min(kv_lora_rank, 512)); | ||||
|     DISPATCH_BY_KV_CACHE_DTYPE(kv_c.dtype(), kv_cache_dtype, | ||||
|                                CALL_CONCAT_AND_CACHE_MLA); | ||||
|   } | ||||
| } | ||||
|  | ||||
| namespace vllm { | ||||
| @ -922,3 +1198,98 @@ void cp_gather_cache( | ||||
|     TORCH_CHECK(false, "Unsupported data type width: ", dtype_bits); | ||||
|   } | ||||
| } | ||||
|  | ||||
| // Macro to dispatch the kernel based on the data type. | ||||
| // Launches vllm::indexer_k_quant_and_cache_kernel with the given template | ||||
| // arguments. The expansion site must have `grid`, `block`, `stream`, the | ||||
| // tensors (k, kv_cache, slot_mapping) and the scalar locals named below in | ||||
| // scope. | ||||
| #define CALL_INDEXER_K_QUANT_AND_CACHE(KV_T, CACHE_T, KV_DTYPE)         \ | ||||
|   vllm::indexer_k_quant_and_cache_kernel<KV_T, CACHE_T, KV_DTYPE>       \ | ||||
|       <<<grid, block, 0, stream>>>(                                     \ | ||||
|           reinterpret_cast<KV_T*>(k.data_ptr()),                        \ | ||||
|           reinterpret_cast<CACHE_T*>(kv_cache.data_ptr()),              \ | ||||
|           slot_mapping.data_ptr<int64_t>(), head_dim, quant_block_size, \ | ||||
|           cache_block_size, cache_stride, use_ue8m0); | ||||
|  | ||||
| // Host entry point: quantizes the indexer keys in `k` and writes them, with | ||||
| // their scales, into `kv_cache` at the slots given by `slot_mapping`. | ||||
| // | ||||
| // `scale_fmt` enables power-of-two scales only when it equals "ue8m0"; any | ||||
| // other string selects plain float scales. The cache dtype passed to the | ||||
| // dispatch macro is fixed to "fp8_e4m3". | ||||
| // Raises (TORCH_CHECK) if the tensors are on different devices or head_dim | ||||
| // is not a multiple of quant_block_size. | ||||
| void indexer_k_quant_and_cache( | ||||
|     torch::Tensor& k,             // [num_tokens, head_dim] | ||||
|     torch::Tensor& kv_cache,      // [num_blocks, block_size, cache_stride] | ||||
|     torch::Tensor& slot_mapping,  // [num_tokens] | ||||
|     int64_t quant_block_size,     // quantization block size | ||||
|     const std::string& scale_fmt) { | ||||
|   // These locals are picked up by name inside CALL_INDEXER_K_QUANT_AND_CACHE. | ||||
|   int num_tokens = k.size(0); | ||||
|   int head_dim = k.size(1); | ||||
|   int cache_block_size = kv_cache.size(1); | ||||
|   int cache_stride = kv_cache.size(2); | ||||
|   bool use_ue8m0 = scale_fmt == "ue8m0"; | ||||
|  | ||||
|   TORCH_CHECK(k.device() == kv_cache.device(), | ||||
|               "k and kv_cache must be on the same device"); | ||||
|   TORCH_CHECK(k.device() == slot_mapping.device(), | ||||
|               "k and slot_mapping must be on the same device"); | ||||
|   TORCH_CHECK(head_dim % quant_block_size == 0, | ||||
|               "head_dim must be divisible by quant_block_size"); | ||||
|  | ||||
|   // Grid: one x-block per token; y covers head_dim in steps of | ||||
|   // quant_block_size * vec_size elements. Block: 32 x vec_size threads. | ||||
|   constexpr int vec_size = 4; | ||||
|   dim3 grid(num_tokens, (head_dim + quant_block_size * vec_size - 1) / | ||||
|                             (quant_block_size * vec_size)); | ||||
|   dim3 block(32, vec_size); | ||||
|   const at::cuda::OptionalCUDAGuard device_guard(device_of(k)); | ||||
|   const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); | ||||
|  | ||||
|   DISPATCH_BY_KV_CACHE_DTYPE(k.dtype(), "fp8_e4m3", | ||||
|                              CALL_INDEXER_K_QUANT_AND_CACHE); | ||||
| } | ||||
|  | ||||
| // Macro to dispatch the kernel based on the data amount. | ||||
| // BLOCK_Y_SIZE is the number of tokens handled per thread block; it is both | ||||
| // the kernel's template argument and the y extent of the block (8 x | ||||
| // BLOCK_Y_SIZE threads). The grid covers num_tokens in x and head_dim in | ||||
| // steps of 8 * vec_size in y. The expansion site must have `num_tokens`, | ||||
| // `head_dim`, `vec_size`, `batch_size`, `quant_block_size`, `stream` and the | ||||
| // tensors named below in scope. | ||||
| #define CALL_CP_GATHER_INDEXER_K_QUANT_CACHE(BLOCK_Y_SIZE)                  \ | ||||
|   vllm::cp_gather_indexer_k_quant_cache_kernel<BLOCK_Y_SIZE>                \ | ||||
|       <<<dim3((num_tokens + BLOCK_Y_SIZE - 1) / BLOCK_Y_SIZE,               \ | ||||
|               (head_dim + 8 * vec_size - 1) / (8 * vec_size)),              \ | ||||
|          dim3(8, BLOCK_Y_SIZE), 0, stream>>>(                               \ | ||||
|           reinterpret_cast<char*>(kv_cache.data_ptr()),                     \ | ||||
|           reinterpret_cast<char*>(dst_k.data_ptr()),                        \ | ||||
|           reinterpret_cast<char*>(dst_scale.data_ptr()),                    \ | ||||
|           block_table.data_ptr<int32_t>(), cu_seq_lens.data_ptr<int32_t>(), \ | ||||
|           batch_size, dst_k.stride(0), dst_k.size(1), kv_cache.stride(0),   \ | ||||
|           kv_cache.stride(1), kv_cache.size(1), block_table.size(1),        \ | ||||
|           num_tokens, quant_block_size); | ||||
|  | ||||
| // Host entry point: gathers quantized indexer keys and scales from the paged | ||||
| // `kv_cache` into the dense outputs `dst_k` and `dst_scale`, using | ||||
| // `block_table` and `cu_seq_lens` to locate each token. | ||||
| // | ||||
| // quant_block_size is not passed in; it is derived from the output shapes. | ||||
| // Raises (TORCH_CHECK) if the tensors are on different devices or head_dim | ||||
| // is not a multiple of the derived quant_block_size. | ||||
| void cp_gather_indexer_k_quant_cache( | ||||
|     const torch::Tensor& kv_cache,  // [num_blocks, block_size, cache_stride] | ||||
|     torch::Tensor& dst_k,           // [num_tokens, head_dim] | ||||
|     torch::Tensor& dst_scale,  // [num_tokens, head_dim / quant_block_size * 4] | ||||
|     const torch::Tensor& block_table,  // [batch_size, num_blocks] | ||||
|     const torch::Tensor& cu_seq_lens   // [batch_size + 1] | ||||
| ) { | ||||
|   int batch_size = block_table.size(0); | ||||
|   int num_tokens = dst_k.size(0); | ||||
|   int head_dim = dst_k.size(1); | ||||
|   // NOTE(review): dst_scale.size(1) == 0 would divide by zero here, and a | ||||
|   // result of 0 would do the same in the modulo check below -- confirm the | ||||
|   // callers always pass a non-empty dst_scale. | ||||
|   int quant_block_size = head_dim * 4 / dst_scale.size(1); | ||||
|  | ||||
|   TORCH_CHECK(kv_cache.device() == dst_k.device(), | ||||
|               "kv_cache and dst_k must be on the same device"); | ||||
|   TORCH_CHECK(kv_cache.device() == dst_scale.device(), | ||||
|               "kv_cache and dst_scale must be on the same device"); | ||||
|   TORCH_CHECK(kv_cache.device() == block_table.device(), | ||||
|               "kv_cache and block_table must be on the same device"); | ||||
|   TORCH_CHECK(kv_cache.device() == cu_seq_lens.device(), | ||||
|               "kv_cache and cu_seq_lens must be on the same device"); | ||||
|   TORCH_CHECK(head_dim % quant_block_size == 0, | ||||
|               "head_dim must be divisible by quant_block_size"); | ||||
|  | ||||
|   // vec_size is read by the dispatch macro when it sizes the grid. | ||||
|   constexpr int vec_size = 16; | ||||
|   const at::cuda::OptionalCUDAGuard device_guard(device_of(kv_cache)); | ||||
|   const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); | ||||
|  | ||||
|   // Pick the tokens-per-block template value from the token count; it has to | ||||
|   // be a compile-time constant, hence the explicit ladder. | ||||
|   if (num_tokens < 32) { | ||||
|     CALL_CP_GATHER_INDEXER_K_QUANT_CACHE(1); | ||||
|   } else if (num_tokens < 64) { | ||||
|     CALL_CP_GATHER_INDEXER_K_QUANT_CACHE(2); | ||||
|   } else if (num_tokens < 128) { | ||||
|     CALL_CP_GATHER_INDEXER_K_QUANT_CACHE(4); | ||||
|   } else if (num_tokens < 256) { | ||||
|     CALL_CP_GATHER_INDEXER_K_QUANT_CACHE(8); | ||||
|   } else if (num_tokens < 512) { | ||||
|     CALL_CP_GATHER_INDEXER_K_QUANT_CACHE(16); | ||||
|   } else { | ||||
|     CALL_CP_GATHER_INDEXER_K_QUANT_CACHE(32); | ||||
|   } | ||||
| } | ||||
|  | ||||
							
								
								
									
										19
									
								
								csrc/core/batch_invariant.hpp
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										19
									
								
								csrc/core/batch_invariant.hpp
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,19 @@ | ||||
| #pragma once | ||||
| #include <cstdlib> | ||||
| #include <string> | ||||
| #include <cctype> | ||||
|  | ||||
| namespace vllm { | ||||
|  | ||||
| // vllm_is_batch_invariant(): returns true when the environment variable | ||||
| // VLLM_BATCH_INVARIANT is set and std::atoi() parses it to a non-zero value | ||||
| // (so "1" enables it; unset, "0", or non-numeric text leaves it disabled). | ||||
| // The environment is read once, on the first call; later calls return the | ||||
| // cached result even if the variable changes afterwards. | ||||
| inline bool vllm_is_batch_invariant() { | ||||
|   static bool cached = []() { | ||||
|     std::string env_key = "VLLM_BATCH_INVARIANT"; | ||||
|     const char* val = std::getenv(env_key.c_str()); | ||||
|     // NOTE(review): std::atoi has undefined behavior when the value does not | ||||
|     // fit in an int; std::strtol would make that case well-defined. | ||||
|     return (val && std::atoi(val) != 0) ? 1 : 0; | ||||
|   }(); | ||||
|   return cached; | ||||
| } | ||||
|  | ||||
| }  // namespace vllm | ||||
| @ -137,9 +137,8 @@ DNNLMatMulPrimitiveHandler::DNNLMatMulPrimitiveHandler( | ||||
| } | ||||
|  | ||||
| void DNNLMatMulPrimitiveHandler::prepack_weight( | ||||
|     void* original_b_ptr, dnnl::memory::desc b_target_mem_desc) { | ||||
|   dnnl::memory::desc original_b_md({b_k_size_, b_n_size_}, b_type_, | ||||
|                                    {b_k_stride_, b_n_stride_}); | ||||
|     void* original_b_ptr, dnnl::memory::desc original_b_md, | ||||
|     dnnl::memory::desc b_target_mem_desc) { | ||||
|   dnnl::memory original_weight(original_b_md, default_engine(), original_b_ptr); | ||||
|   dnnl::memory packed_weight(b_target_mem_desc, default_engine()); | ||||
|   { | ||||
| @ -250,7 +249,9 @@ W8A8MatMulPrimitiveHandler::W8A8MatMulPrimitiveHandler(const Args& args) | ||||
|   if (a_qs_ == QuantizationStrategy::PER_TOKEN) { | ||||
|     assert(!use_azp_); | ||||
|   }; | ||||
|   prepack_weight(args.b_ptr, | ||||
|   dnnl::memory::desc original_b_md({b_k_size_, b_n_size_}, b_type_, | ||||
|                                    {b_k_stride_, b_n_stride_}); | ||||
|   prepack_weight(args.b_ptr, original_b_md, | ||||
|                  create_primitive_desc( | ||||
|                      MSizeCacheKey{.a_m_size = DNNL_RUNTIME_DIM_VAL, | ||||
|                                    .use_bias = false, | ||||
| @ -412,12 +413,25 @@ MatMulPrimitiveHandler::MatMulPrimitiveHandler(const Args& args) | ||||
|   assert(ab_type_ == dnnl::memory::data_type::f32 || | ||||
|          ab_type_ == dnnl::memory::data_type::bf16 || | ||||
|          ab_type_ == dnnl::memory::data_type::f16); | ||||
|   prepack_weight(args.b_ptr, | ||||
|  | ||||
|   dnnl::memory::desc original_b_md({b_k_size_, b_n_size_}, b_type_, | ||||
|                                    {b_k_stride_, b_n_stride_}); | ||||
|  | ||||
|   prepack_weight(args.b_ptr, original_b_md, | ||||
|                  create_primitive_desc( | ||||
|                      MSizeCacheKey{.a_m_size = DNNL_RUNTIME_DIM_VAL, | ||||
|                                    .a_m_stride = DNNL_RUNTIME_DIM_VAL, | ||||
|                                    .use_bias = false, | ||||
|                                    .bias_type = dnnl::memory::data_type::undef}, | ||||
|                      MSizeCacheKey{ | ||||
| #ifdef VLLM_USE_ACL | ||||
|                          // Arm Compute Library (ACL) backend for oneDNN does | ||||
|                          // not support runtime | ||||
|                          // dimensions, so we set M to a default value | ||||
|                          .a_m_size = 128, | ||||
|                          .a_m_stride = b_k_size_, | ||||
| #else | ||||
|                          .a_m_size = DNNL_RUNTIME_DIM_VAL, | ||||
|                          .a_m_stride = DNNL_RUNTIME_DIM_VAL, | ||||
| #endif | ||||
|                          .use_bias = false, | ||||
|                          .bias_type = dnnl::memory::data_type::undef}, | ||||
|                      true) | ||||
|                      .weights_desc()); | ||||
|   init_runtime_memory_cache(args); | ||||
| @ -443,13 +457,31 @@ void MatMulPrimitiveHandler::execute(ExecArgs& args) { | ||||
|   c_storage->set_data_handle((void*)args.c_ptr); | ||||
|   c_mem_desc->dims[0] = args.a_m_size; | ||||
|  | ||||
| #ifndef VLLM_USE_ACL | ||||
|   // We do not support in ACL backend of oneDNN, we handle bias by: | ||||
|   // 1. copying it into the result tensor | ||||
|   // 2. attaching a fused-sum post-op to the matmul primitive | ||||
|   if (args.use_bias) { | ||||
|     auto&& [bias_storage, bias_mem_desc] = get_runtime_memory_ptr(2); | ||||
|     bias_storage->set_data_handle((void*)args.bias_ptr); | ||||
|   } | ||||
|  | ||||
| #endif | ||||
|   dnnl::matmul matmul = get_matmul_cache(args); | ||||
|  | ||||
| // With ACL backend of oneDNN, the required memory format might change when the | ||||
| // source tensor dims change. This does not really happen in practice, so isn't | ||||
| // a performance hit, but we need to support it because the API allows for it. | ||||
| #ifdef VLLM_USE_ACL | ||||
|   auto new_expected_wei_desc = | ||||
|       dnnl::matmul::primitive_desc( | ||||
|           const_cast<dnnl_primitive_desc_t>(matmul.get_primitive_desc())) | ||||
|           .weights_desc(); | ||||
|   if (new_expected_wei_desc != b_target_mem_desc_) { | ||||
|     prepack_weight(memory_cache_[DNNL_ARG_WEIGHTS].get_data_handle(), | ||||
|                    b_target_mem_desc_, new_expected_wei_desc); | ||||
|   } | ||||
| #endif | ||||
|  | ||||
|   auto&& [scratchpad_storage, scratchpad_mem_desc] = get_runtime_memory_ptr(3); | ||||
|   scratchpad_storage->set_data_handle( | ||||
|       DNNLScratchPadManager::get_dnnl_scratchpad_manager()->get_data<void>()); | ||||
| @ -484,7 +516,13 @@ dnnl::matmul::primitive_desc MatMulPrimitiveHandler::create_primitive_desc( | ||||
|   } else { | ||||
|     a_md = dnnl::memory::desc({key.a_m_size, b_k_size_}, b_type_, | ||||
|                               {key.a_m_stride, 1}); | ||||
| #ifdef VLLM_USE_ACL | ||||
|     // ACL's backend of oneDNN always expects the weight format to be "any" | ||||
|     b_md = dnnl::memory::desc({b_k_size_, b_n_size_}, b_type_, | ||||
|                               dnnl::memory::format_tag::any); | ||||
| #else | ||||
|     b_md = b_target_mem_desc_; | ||||
| #endif | ||||
|   } | ||||
|   dnnl::memory::desc c_md({key.a_m_size, b_n_size_}, c_type_, | ||||
|                           dnnl::memory::format_tag::ab); | ||||
| @ -494,8 +532,18 @@ dnnl::matmul::primitive_desc MatMulPrimitiveHandler::create_primitive_desc( | ||||
|  | ||||
|   if (key.use_bias) { | ||||
|     dnnl::memory::desc bias_md({1, b_n_size_}, key.bias_type, {b_n_size_, 1}); | ||||
| // Since ACL's matmuls don't support passing a bias_md, we apply the bias | ||||
| // through a fused-sum post-op | ||||
| #ifdef VLLM_USE_ACL | ||||
|     dnnl::post_ops post_ops; | ||||
|     post_ops.append_sum(); | ||||
|     attr.set_post_ops(post_ops); | ||||
|     return dnnl::matmul::primitive_desc(default_engine(), a_md, b_md, c_md, | ||||
|                                         attr); | ||||
| #else | ||||
|     return dnnl::matmul::primitive_desc(default_engine(), a_md, b_md, bias_md, | ||||
|                                         c_md, attr); | ||||
| #endif | ||||
|   } else { | ||||
|     return dnnl::matmul::primitive_desc(default_engine(), a_md, b_md, c_md, | ||||
|                                         attr); | ||||
| @ -511,13 +559,23 @@ void MatMulPrimitiveHandler::init_runtime_memory_cache(const Args& args) { | ||||
|                    default_engine(), nullptr); | ||||
|   set_runtime_memory_ptr(1, memory_cache_[DNNL_ARG_DST].get()); | ||||
|  | ||||
| // ACL matmuls don't support bias_md, so we don't need these | ||||
| #ifndef VLLM_USE_ACL | ||||
|   memory_cache_[DNNL_ARG_BIAS] = | ||||
|       dnnl::memory({{b_n_size_}, dnnl::memory::data_type::f32, {1}}, | ||||
|                    default_engine(), nullptr); | ||||
|   set_runtime_memory_ptr(2, memory_cache_[DNNL_ARG_BIAS].get()); | ||||
|  | ||||
| #endif | ||||
|   memory_cache_[DNNL_ARG_SCRATCHPAD] = | ||||
|       dnnl::memory({{b_n_size_}, dnnl::memory::data_type::f32, {1}}, | ||||
|                    default_engine(), nullptr); | ||||
|   set_runtime_memory_ptr(3, memory_cache_[DNNL_ARG_SCRATCHPAD].get()); | ||||
| } | ||||
|  | ||||
| // Reports whether this translation unit was compiled with VLLM_USE_ACL | ||||
| // defined. The answer is fixed at build time; nothing is probed at runtime. | ||||
| bool is_onednn_acl_supported() { | ||||
| #ifdef VLLM_USE_ACL | ||||
|   return true; | ||||
| #else | ||||
|   return false; | ||||
| #endif | ||||
| } | ||||
|  | ||||
| @ -101,7 +101,7 @@ class DNNLMatMulPrimitiveHandler { | ||||
|  protected: | ||||
|   DNNLMatMulPrimitiveHandler(const Args& args, dnnl::memory::data_type b_type); | ||||
|  | ||||
|   void prepack_weight(void* original_b_ptr, | ||||
|   void prepack_weight(void* original_b_ptr, dnnl::memory::desc original_b_md, | ||||
|                       dnnl::memory::desc b_target_mem_desc); | ||||
|  | ||||
|   void set_runtime_memory_ptr(size_t index, dnnl_memory* memory_ptr); | ||||
|  | ||||
| @ -527,21 +527,42 @@ void onednn_mm(torch::Tensor& c,        // [M, OC], row-major | ||||
|   MatMulPrimitiveHandler* ptr = | ||||
|       reinterpret_cast<MatMulPrimitiveHandler*>(handler); | ||||
|  | ||||
| // ACL matmuls expect contiguous source tensors | ||||
| #ifdef VLLM_USE_ACL | ||||
|   torch::Tensor a_contig = a.contiguous(); | ||||
| #endif | ||||
|  | ||||
|   MatMulPrimitiveHandler::ExecArgs exec_args; | ||||
|  | ||||
| #ifdef VLLM_USE_ACL | ||||
|   exec_args.a_m_size = a_contig.size(0); | ||||
|   exec_args.a_m_stride = a_contig.stride(0); | ||||
| #else | ||||
|   exec_args.a_m_size = a.size(0); | ||||
|   exec_args.a_m_stride = a.stride(0); | ||||
|  | ||||
| #endif | ||||
|   VLLM_DISPATCH_FLOATING_TYPES(a.scalar_type(), "onednn_mm", [&] { | ||||
|     if (bias.has_value()) { | ||||
|       exec_args.use_bias = true; | ||||
|       exec_args.bias_type = get_dnnl_type<scalar_t>(); | ||||
| #ifdef VLLM_USE_ACL | ||||
|       // ACL matmuls in oneDNN do not support a bias. | ||||
|       // We handle a matmul with bias by doing: c = bias; c += matmul(a, b) | ||||
|       c.copy_(bias.value()); | ||||
| #else | ||||
|       exec_args.bias_ptr = bias->data_ptr<scalar_t>(); | ||||
| #endif | ||||
|     } else { | ||||
|       exec_args.use_bias = false; | ||||
|       exec_args.bias_type = get_dnnl_type<void>(); | ||||
|       exec_args.bias_ptr = nullptr; | ||||
|     } | ||||
| #ifdef VLLM_USE_ACL | ||||
|     exec_args.a_ptr = a_contig.data_ptr<scalar_t>(); | ||||
| #else | ||||
|     exec_args.a_ptr = a.data_ptr<scalar_t>(); | ||||
|  | ||||
| #endif | ||||
|     exec_args.c_ptr = c.data_ptr<scalar_t>(); | ||||
|  | ||||
|     ptr->execute(exec_args); | ||||
|  | ||||
| @ -27,6 +27,8 @@ int64_t create_onednn_mm_handler(const torch::Tensor& b, | ||||
| void onednn_mm(torch::Tensor& c, const torch::Tensor& a, | ||||
|                const std::optional<torch::Tensor>& bias, int64_t handler); | ||||
|  | ||||
| bool is_onednn_acl_supported(); | ||||
|  | ||||
| void mla_decode_kvcache(torch::Tensor& out, torch::Tensor& query, | ||||
|                         torch::Tensor& kv_cache, double scale, | ||||
|                         torch::Tensor& block_tables, torch::Tensor& seq_lens); | ||||
| @ -181,6 +183,9 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { | ||||
|       "int handler) -> ()"); | ||||
|   ops.impl("onednn_mm", torch::kCPU, &onednn_mm); | ||||
|  | ||||
|   // Check if oneDNN was built with ACL backend | ||||
|   ops.def("is_onednn_acl_supported() -> bool", &is_onednn_acl_supported); | ||||
|  | ||||
|   // Create oneDNN W8A8 handler | ||||
|   ops.def( | ||||
|       "create_onednn_scaled_mm_handler(Tensor b, Tensor b_scales, ScalarType " | ||||
|  | ||||
| @ -12,6 +12,7 @@ using CubMaxOp = cub::Max; | ||||
|   #endif  // CUB_VERSION | ||||
| #else | ||||
|   #include <hipcub/hipcub.hpp> | ||||
| using CubAddOp = cub::Sum; | ||||
| using CubMaxOp = cub::Max; | ||||
| namespace cub = hipcub; | ||||
| using CubAddOp = hipcub::Sum; | ||||
| using CubMaxOp = hipcub::Max; | ||||
| #endif  // USE_ROCM | ||||
|  | ||||
| @ -2,7 +2,6 @@ | ||||
| # SPDX-FileCopyrightText: Copyright contributors to the vLLM project | ||||
|  | ||||
| import enum | ||||
| from typing import Union | ||||
|  | ||||
| from cutlass_library import * | ||||
|  | ||||
| @ -22,31 +21,31 @@ class MixedInputKernelScheduleType(enum.Enum): | ||||
|     TmaWarpSpecializedCooperative = enum_auto() | ||||
|  | ||||
|  | ||||
| VLLMDataTypeNames: dict[Union[VLLMDataType, DataType], str] = { | ||||
| VLLMDataTypeNames: dict[VLLMDataType | DataType, str] = { | ||||
|     **DataTypeNames,  # type: ignore | ||||
|     **{ | ||||
|         VLLMDataType.u4b8: "u4b8", | ||||
|         VLLMDataType.u8b128: "u8b128", | ||||
|     } | ||||
|     }, | ||||
| } | ||||
|  | ||||
| VLLMDataTypeTag: dict[Union[VLLMDataType, DataType], str] = { | ||||
| VLLMDataTypeTag: dict[VLLMDataType | DataType, str] = { | ||||
|     **DataTypeTag,  # type: ignore | ||||
|     **{ | ||||
|         VLLMDataType.u4b8: "cutlass::vllm_uint4b8_t", | ||||
|         VLLMDataType.u8b128: "cutlass::vllm_uint8b128_t", | ||||
|     } | ||||
|     }, | ||||
| } | ||||
|  | ||||
| VLLMDataTypeSize: dict[Union[VLLMDataType, DataType], int] = { | ||||
| VLLMDataTypeSize: dict[VLLMDataType | DataType, int] = { | ||||
|     **DataTypeSize,  # type: ignore | ||||
|     **{ | ||||
|         VLLMDataType.u4b8: 4, | ||||
|         VLLMDataType.u8b128: 8, | ||||
|     } | ||||
|     }, | ||||
| } | ||||
|  | ||||
| VLLMDataTypeVLLMScalarTypeTag: dict[Union[VLLMDataType, DataType], str] = { | ||||
| VLLMDataTypeVLLMScalarTypeTag: dict[VLLMDataType | DataType, str] = { | ||||
|     VLLMDataType.u4b8: "vllm::kU4B8", | ||||
|     VLLMDataType.u8b128: "vllm::kU8B128", | ||||
|     DataType.u4: "vllm::kU4", | ||||
| @ -57,7 +56,7 @@ VLLMDataTypeVLLMScalarTypeTag: dict[Union[VLLMDataType, DataType], str] = { | ||||
|     DataType.bf16: "vllm::kBfloat16", | ||||
| } | ||||
|  | ||||
| VLLMDataTypeTorchDataTypeTag: dict[Union[VLLMDataType, DataType], str] = { | ||||
| VLLMDataTypeTorchDataTypeTag: dict[VLLMDataType | DataType, str] = { | ||||
|     DataType.u8: "at::ScalarType::Byte", | ||||
|     DataType.s8: "at::ScalarType::Char", | ||||
|     DataType.e4m3: "at::ScalarType::Float8_e4m3fn", | ||||
| @ -67,15 +66,11 @@ VLLMDataTypeTorchDataTypeTag: dict[Union[VLLMDataType, DataType], str] = { | ||||
|     DataType.f32: "at::ScalarType::Float", | ||||
| } | ||||
|  | ||||
| VLLMKernelScheduleTag: dict[Union[ | ||||
|     MixedInputKernelScheduleType, KernelScheduleType], str] = { | ||||
|         **KernelScheduleTag,  # type: ignore | ||||
|         **{ | ||||
|             MixedInputKernelScheduleType.TmaWarpSpecialized: | ||||
|             "cutlass::gemm::KernelTmaWarpSpecialized", | ||||
|             MixedInputKernelScheduleType.TmaWarpSpecializedPingpong: | ||||
|             "cutlass::gemm::KernelTmaWarpSpecializedPingpong", | ||||
|             MixedInputKernelScheduleType.TmaWarpSpecializedCooperative: | ||||
|             "cutlass::gemm::KernelTmaWarpSpecializedCooperative", | ||||
|         } | ||||
|     } | ||||
| VLLMKernelScheduleTag: dict[MixedInputKernelScheduleType | KernelScheduleType, str] = { | ||||
|     **KernelScheduleTag,  # type: ignore | ||||
|     **{ | ||||
|         MixedInputKernelScheduleType.TmaWarpSpecialized: "cutlass::gemm::KernelTmaWarpSpecialized",  # noqa: E501 | ||||
|         MixedInputKernelScheduleType.TmaWarpSpecializedPingpong: "cutlass::gemm::KernelTmaWarpSpecializedPingpong",  # noqa: E501 | ||||
|         MixedInputKernelScheduleType.TmaWarpSpecializedCooperative: "cutlass::gemm::KernelTmaWarpSpecializedCooperative",  # noqa: E501 | ||||
|     }, | ||||
| } | ||||
|  | ||||
Some files were not shown because too many files have changed in this diff Show More
		Reference in New Issue
	
	Block a user
	