Mirror of https://github.com/vllm-project/vllm.git, synced 2025-10-27 11:54:35 +08:00
Compare commits
1078 Commits
| SHA1 | Author | Date | |
|---|---|---|---|
| 8f89d72090 | |||
| 99dac099ab | |||
| c4bd03c7c5 | |||
| dcbf4286af | |||
| 00e6a2dc53 | |||
| 2e02311a1b | |||
| 89ec06c33b | |||
| 9fde251bf0 | |||
| 4c2ffb28ff | |||
| 246598a6b1 | |||
| 8bab4959be | |||
| 3c4cebf751 | |||
| d8f31f2f8b | |||
| 640052b069 | |||
| 351d5e7b82 | |||
| a008629807 | |||
| 76477a93b7 | |||
| 77c87beb06 | |||
| 114332b88e | |||
| cb77ad836f | |||
| 856c990041 | |||
| c5602f0baa | |||
| f7f9c5f97b | |||
| 2c0d933594 | |||
| 774d1035e4 | |||
| 6b29d6fe70 | |||
| 0bfa1c4f13 | |||
| c81da5f56d | |||
| 68bc81703e | |||
| 5884c2b454 | |||
| 45f92c00cf | |||
| 5467ac3196 | |||
| 5d7e3d0176 | |||
| 0373e1837e | |||
| c09dade2a2 | |||
| 8ea5e44a43 | |||
| 9fb900f90c | |||
| c96fc06747 | |||
| b3376e5c76 | |||
| e69ded7d1c | |||
| 767c727a81 | |||
| 6840a71610 | |||
| 7a9cb294ae | |||
| ca3ea51bde | |||
| dc49fb892c | |||
| 18a277b52d | |||
| 8d75fe48ca | |||
| 388596c914 | |||
| baa15a9ec3 | |||
| 15063741e3 | |||
| ccdc490dda | |||
| a31cab7556 | |||
| 828da0d44e | |||
| abe855d637 | |||
| 4efff036f0 | |||
| 89c920785f | |||
| 7b0a0dfb22 | |||
| 3a6ae1d33c | |||
| 8f1729b829 | |||
| 6a7c7711a2 | |||
| 0f83ddd4d7 | |||
| 065aff6c16 | |||
| 3d33e372a1 | |||
| faf71bcd4b | |||
| f270a39537 | |||
| 51a08e7d8f | |||
| eb8fcd2666 | |||
| 5563a4dea8 | |||
| ccd4f129e8 | |||
| 02cc3b51a7 | |||
| d5b1eb081e | |||
| f0a500545f | |||
| c65146e75e | |||
| 41ca62cf03 | |||
| 974fc9b845 | |||
| fee4dcc33a | |||
| 650a4cc55e | |||
| 9ca62d8668 | |||
| 45c35f0d58 | |||
| 9ba093b4f4 | |||
| 27208be66e | |||
| 87d5abef75 | |||
| ec784b2526 | |||
| a58f24e590 | |||
| f42a006b15 | |||
| 3a434b07ed | |||
| bd0e7802e0 | |||
| 06b2550cbb | |||
| f775a07e30 | |||
| 4f0d17c05c | |||
| 10c38e3e46 | |||
| cafb8e06c5 | |||
| cbb2f59cc8 | |||
| 0ab278ca31 | |||
| 7a64d24aad | |||
| dfbe60dc62 | |||
| a66cf40b20 | |||
| f790ad3c50 | |||
| ed59a7ed23 | |||
| 044793d8df | |||
| c2d6d2f960 | |||
| 8279078e21 | |||
| b9c0605a8e | |||
| 37464a0f74 | |||
| c354072828 | |||
| f081c3ce4b | |||
| 260d119e86 | |||
| a360ff80bb | |||
| 1197e02141 | |||
| 657579113f | |||
| e9899fb7a4 | |||
| a377f0bd5e | |||
| e9d3aa04f6 | |||
| a22dea54d3 | |||
| 533c217792 | |||
| 6d21fa1cad | |||
| b35be5403f | |||
| 45a1a69b98 | |||
| 87a658c812 | |||
| 429d89720e | |||
| a9bcc7afb2 | |||
| d79d9eaaff | |||
| f758505c73 | |||
| d910816c73 | |||
| 87d41c849d | |||
| e07aff9e52 | |||
| 5bf185a1c4 | |||
| 4fbcb0f27e | |||
| 7c3604fb68 | |||
| b1c255630d | |||
| eb6c50cdc2 | |||
| eecd864388 | |||
| ae495c74ea | |||
| 4238bc82f2 | |||
| 594392d27a | |||
| 18c1f16d86 | |||
| 5bd3c65072 | |||
| 616e600e0b | |||
| dfba529b40 | |||
| 5ae5ed1e60 | |||
| 290f4ada2b | |||
| dd8de11f0a | |||
| 9ba415588a | |||
| d4f3985907 | |||
| 890aa93d27 | |||
| fbdb7b3ee2 | |||
| 1102bef219 | |||
| f17a1a8f96 | |||
| d5a1697772 | |||
| 325c119961 | |||
| 8e192ff967 | |||
| e64fde4b01 | |||
| 919770957f | |||
| 6a50f4cafa | |||
| e3470f8753 | |||
| a1242324c9 | |||
| 5eda2ea02a | |||
| 2ba80bed27 | |||
| 6066253296 | |||
| ee3eea0a1b | |||
| a36de682d4 | |||
| eb6d3c264d | |||
| 97b030005c | |||
| a3a73ab069 | |||
| 8674f9880e | |||
| c74c913bfb | |||
| 5f6d10c14c | |||
| 9b9a10d6cb | |||
| 99eff67ba9 | |||
| 14772eeb8e | |||
| 757b62c495 | |||
| e941f88584 | |||
| f12c3b5b3d | |||
| d130b573a0 | |||
| 65ae8c2c8f | |||
| c3af44722c | |||
| 1937e29848 | |||
| f0eecee610 | |||
| 943e72ca56 | |||
| 546a97ef69 | |||
| da5a0b539d | |||
| 6287537a0c | |||
| b57e6c5949 | |||
| 27ce85476e | |||
| f68470e803 | |||
| 2e9a2227ec | |||
| c0724fc915 | |||
| 86b45ae065 | |||
| c5711ef985 | |||
| 48d5985a08 | |||
| 33e0823de5 | |||
| 26148120b3 | |||
| 0150a10630 | |||
| 8e7fb5d43a | |||
| 9a31a817a8 | |||
| 2060e93659 | |||
| 8435b207af | |||
| 10fa9eea21 | |||
| e08188081b | |||
| b5853f9963 | |||
| f09edd8a25 | |||
| 6979ade384 | |||
| 9216b9cc38 | |||
| 5e0391c040 | |||
| dbc0754ddf | |||
| 99caa49106 | |||
| 5c342570d7 | |||
| 973617ae02 | |||
| 30e754390c | |||
| 52f8107cf2 | |||
| fc0d9dfc3a | |||
| 361c461a12 | |||
| a5675d348b | |||
| e9cdd2b1e2 | |||
| 65bf2ac165 | |||
| 8a7cc254a0 | |||
| 29bc01bf3b | |||
| 676a99982f | |||
| dc72402b57 | |||
| ccb63a8245 | |||
| c579b750a0 | |||
| 4bfa7e7f75 | |||
| ac1fbf7fd2 | |||
| 33d3914b1e | |||
| 1356df53bd | |||
| ce532ff45c | |||
| 8bc68e198c | |||
| 0fca3cdcf2 | |||
| e7c46b9527 | |||
| 350f9e107f | |||
| 702bee461f | |||
| a7be4d0072 | |||
| a709e87a4f | |||
| 6eaccb7353 | |||
| e254497b66 | |||
| 4e12131089 | |||
| fcc2994be6 | |||
| 2e7796f2cf | |||
| 706588a77d | |||
| 6a0f617210 | |||
| dac6a3f6ed | |||
| 64b77dfd7e | |||
| 51d4094fda | |||
| e965d46184 | |||
| 208b71bcc1 | |||
| c833101740 | |||
| 379da6dcb5 | |||
| ebce310b74 | |||
| be0c5180ac | |||
| cea64430f6 | |||
| a3c124570a | |||
| ff5abcd746 | |||
| 0ee535b294 | |||
| 190bc838e1 | |||
| f12b20decc | |||
| 16bc0a098f | |||
| e288df0632 | |||
| 8b9241be3a | |||
| f942efb5a3 | |||
| 89579a201f | |||
| 230c4b38c1 | |||
| 20cfcdec99 | |||
| ad932a221d | |||
| 5510cf0e8a | |||
| 0f9a6e3d22 | |||
| f6a593093a | |||
| d7740ea4dc | |||
| cc466a3290 | |||
| 8344f7742b | |||
| 469f85c782 | |||
| 10760da800 | |||
| 478aed5827 | |||
| 63575bc2e1 | |||
| a98187cf72 | |||
| bd99d22629 | |||
| 19cb4716ee | |||
| e186d37cb1 | |||
| 323f27b904 | |||
| 0650e5935b | |||
| c7f2cf2b7f | |||
| 8d8357c8ed | |||
| 4302987069 | |||
| 021b1a2ab7 | |||
| 2a052011ca | |||
| 36fb68f947 | |||
| bc8ad68455 | |||
| 344bf7cd2d | |||
| ab50275111 | |||
| 43c413ec57 | |||
| f8e7adda21 | |||
| 7e65477e5e | |||
| 3521ba4f25 | |||
| 2d7bce9cd5 | |||
| ce3f1eedf8 | |||
| 808632d3b4 | |||
| 344a5d0c33 | |||
| 0f8a91401c | |||
| 9b5c9f9484 | |||
| 32881f3f31 | |||
| 5b8a7c1cb0 | |||
| 1ff0c73a79 | |||
| 5ad60b0cbd | |||
| fb087af52e | |||
| 7038e8b803 | |||
| 2a85f93007 | |||
| cf8cac8c70 | |||
| 5e401bce17 | |||
| 0d62fe58db | |||
| b8afa8b95a | |||
| 826b82a260 | |||
| c9d852d601 | |||
| 6ef09b08f8 | |||
| 3a922c1e7e | |||
| c47ba4aaa9 | |||
| 24bb4fe432 | |||
| a657bfc48a | |||
| 24750f4cad | |||
| b38e42fbca | |||
| 8b798eec75 | |||
| 69909126a7 | |||
| e491c7e053 | |||
| 4dc8026d86 | |||
| a88bb9b032 | |||
| 6f1df80436 | |||
| d6f4bd7cdd | |||
| c3845d82dc | |||
| a822eb3413 | |||
| f458112e8a | |||
| 2e240c69a9 | |||
| ee37328da0 | |||
| 6ad58f42c5 | |||
| dd1a50a8bc | |||
| 715c2d854d | |||
| a494140433 | |||
| 111815d482 | |||
| b31a1fb63c | |||
| 4bb53e2dde | |||
| 26f2fb5113 | |||
| fa32207842 | |||
| d627a3d837 | |||
| f4f921b7f1 | |||
| ac5ccf0156 | |||
| 73c8d677e5 | |||
| df29793dc7 | |||
| 03dd7d52bf | |||
| bf480c5302 | |||
| 9c7306ac11 | |||
| 4ea1f9678d | |||
| ba4be44c32 | |||
| d6e520e170 | |||
| 81661da7b2 | |||
| dfea173148 | |||
| 7134303cbb | |||
| 3da24c2df7 | |||
| eefeb16464 | |||
| 18d23f642a | |||
| 87f545ba6f | |||
| 8947bc3c15 | |||
| 12628d3c78 | |||
| 258a2c58d0 | |||
| aba47be3fe | |||
| a62aaf1df5 | |||
| 603ad84815 | |||
| a88081bf76 | |||
| 2f30e7c72f | |||
| a74dee9b62 | |||
| cf29b7eda4 | |||
| efffb63f58 | |||
| 15e7c675b0 | |||
| b6dcb4d442 | |||
| b5b4a398a7 | |||
| f4bc4de1b1 | |||
| bd7a8eef25 | |||
| 7ee82bef1e | |||
| fbf152d976 | |||
| 479d69fad0 | |||
| 96e90fdeb3 | |||
| a395a638c2 | |||
| 2768884ac4 | |||
| aae08249ac | |||
| 7923dcad12 | |||
| 3cd9b5bb2d | |||
| 468d761b32 | |||
| e4bf860a54 | |||
| 91f50a6fe2 | |||
| 79a268c4ab | |||
| eace8bf0b9 | |||
| 1e8f4252aa | |||
| 2b7949c1c2 | |||
| 62b5166bd4 | |||
| d86285a4a4 | |||
| d87f39e9a9 | |||
| d3c8180ac4 | |||
| 62b8aebc6f | |||
| 050f285ff6 | |||
| 8f2ea22bde | |||
| 0ae11f78ab | |||
| 34128a697e | |||
| c1b4e4157c | |||
| ceaf4ed003 | |||
| ad8d696a99 | |||
| 3d925165f2 | |||
| 1543680691 | |||
| 077f0a2e8a | |||
| e73ed0f1c6 | |||
| 296cdf8ac7 | |||
| 747b1a7147 | |||
| 95e5b087cf | |||
| a37d815b83 | |||
| 7f2593b164 | |||
| fe7d648fe5 | |||
| cc74b2b232 | |||
| 91528575ec | |||
| a22cdea371 | |||
| 682789d402 | |||
| 138485a82d | |||
| bc9df1571b | |||
| 15b86408a8 | |||
| 7be4f5628f | |||
| 8f20fc04bf | |||
| 221d93ecbf | |||
| d17c8477f1 | |||
| a134ef6f5e | |||
| 8a7a3e4436 | |||
| 8f9c28fd40 | |||
| cd2f63fb36 | |||
| 87fa80c91f | |||
| e1bb2fd52d | |||
| 705578ae14 | |||
| e8cc7967ff | |||
| 53b018edcb | |||
| 66ded03067 | |||
| 6dc1fc9cfe | |||
| 533d2a1f39 | |||
| a53222544c | |||
| fe3b5bbc23 | |||
| 8438e0569e | |||
| 11d652bd4f | |||
| d150e4f89f | |||
| e95cd87959 | |||
| 69e1d2fb69 | |||
| 05434764cd | |||
| 4e7ee664e2 | |||
| 37e84a403d | |||
| 4695397dcf | |||
| d619ae2d19 | |||
| eb46fbfda2 | |||
| 0003e9154b | |||
| e11e200736 | |||
| 8db1bf32f8 | |||
| aceb17cf2d | |||
| 563c54f760 | |||
| 2cd6b4f362 | |||
| 711a000255 | |||
| 989ae2538d | |||
| 0a430b4ae2 | |||
| ec8e3c695f | |||
| 98afde19fc | |||
| 5c2e66e487 | |||
| 546e721168 | |||
| b8aacac31a | |||
| d04973ad54 | |||
| fbb9d9eef4 | |||
| 09473ee41c | |||
| d4ec9ffb95 | |||
| 96b6a6d790 | |||
| 36729bac13 | |||
| 7fd3949a0b | |||
| 1096717ae9 | |||
| c2b4a1bce9 | |||
| e46a60aa4c | |||
| 1e96c3341a | |||
| 95e7d4a97c | |||
| 559eb852f8 | |||
| a10d3056da | |||
| 8afca50889 | |||
| 08ccee1e83 | |||
| c1dc547129 | |||
| f3d0bf7589 | |||
| e9da5a40c6 | |||
| e42df7227d | |||
| caada5e50a | |||
| 67b4221a61 | |||
| 63e7176f26 | |||
| 934d3662f7 | |||
| 92cd2e2f21 | |||
| e4c4072c94 | |||
| e35397468f | |||
| 8b317c6dd0 | |||
| bd3c144e0b | |||
| 0258b7a94b | |||
| b3104b2a10 | |||
| c2e00af523 | |||
| c013d32c75 | |||
| 11dd6ebb89 | |||
| 6c0b04515f | |||
| e23a43aef8 | |||
| e7c7067b45 | |||
| 6d592eb430 | |||
| d036198e23 | |||
| 59a6abf3c9 | |||
| bc0c0192d1 | |||
| f46864d68d | |||
| b4543c8f6b | |||
| 0ce0539d47 | |||
| 2f19283549 | |||
| 95baec828f | |||
| e4be7d70bb | |||
| 54951ac4bf | |||
| 18de883489 | |||
| 1d7c940d74 | |||
| cfaf49a167 | |||
| 9edec652e2 | |||
| e0dd4d3589 | |||
| e5043a3e75 | |||
| d03d64fd2e | |||
| 78107fa091 | |||
| c391e4b68e | |||
| 9117f892f0 | |||
| db2a6a41e2 | |||
| ca81ff5196 | |||
| b7782002e1 | |||
| 819a309c0f | |||
| aabe8f40f2 | |||
| 498eb5cfa3 | |||
| 537ee25f43 | |||
| 294f8f6665 | |||
| b95047f2da | |||
| 2ff767b513 | |||
| 3dcb3e8b98 | |||
| c64cf38673 | |||
| 76b889bf1d | |||
| c9b506dad4 | |||
| 5757d90e26 | |||
| a3c226e7eb | |||
| b321d4881b | |||
| ad6eca408b | |||
| 205b94942e | |||
| 3bec41f41a | |||
| 0739b1947f | |||
| 77a6572aa5 | |||
| 0e3f06fe9c | |||
| eb69d68804 | |||
| 7d4e1b85e7 | |||
| 93deb0b38f | |||
| ccb58b23e6 | |||
| 49782fcb76 | |||
| f03cc667a0 | |||
| 563c1d7ec5 | |||
| 9c82a1bec3 | |||
| b6d103542c | |||
| 51c31bc10c | |||
| 3ad438c66f | |||
| 203d4f82ac | |||
| 991143cfcd | |||
| 8b2d3cbc1b | |||
| 9765b5c406 | |||
| 430530fc18 | |||
| 97356f3c7e | |||
| f510395bbf | |||
| 6110c39dc8 | |||
| d8658c8cc1 | |||
| 7bc94a0fdd | |||
| 756b30a5f3 | |||
| 395aa823ea | |||
| 26422e477b | |||
| f342153b48 | |||
| 27a57cad52 | |||
| 98a42e7078 | |||
| 0267fef52a | |||
| 4716a32dd4 | |||
| c0935c96d3 | |||
| cb40b3ab6b | |||
| 515386ef3c | |||
| a4075cba4d | |||
| 96aa014d1e | |||
| 1715056fef | |||
| b51c1cc9d2 | |||
| ce567a2926 | |||
| d6ea427f04 | |||
| 14ccd94c89 | |||
| 8267b06c30 | |||
| 3492859b68 | |||
| 098e1776ba | |||
| 10e6322283 | |||
| 6d9aa00fc4 | |||
| 1182607e18 | |||
| 45b6ef6513 | |||
| 1956931436 | |||
| e24336b5a7 | |||
| d18f4e73f3 | |||
| 82c540bebf | |||
| 8f44facddd | |||
| e66b629c04 | |||
| 76879342a3 | |||
| 566b57c5c4 | |||
| 0dc72273b8 | |||
| a979d9771e | |||
| 8af890a865 | |||
| dfeb2ecc3a | |||
| 3a243095e5 | |||
| 64172a976c | |||
| f408d05c52 | |||
| 0b4997e05c | |||
| c13ad1b7bd | |||
| 819924e749 | |||
| 01bfb22b41 | |||
| e67c295b0c | |||
| 925f3332ca | |||
| b0dfa91dd7 | |||
| 56a8652f33 | |||
| 6d93d35308 | |||
| 837e185142 | |||
| 42bc386129 | |||
| 8b268a46a7 | |||
| 41deac4a3d | |||
| af9e53496f | |||
| f8a12ecc7f | |||
| 3c5ab9b811 | |||
| 743a0b7402 | |||
| bfdb1ba5c3 | |||
| cf2f084d56 | |||
| f721096d48 | |||
| e90fc21f2e | |||
| ea5f14e6ff | |||
| b7050ca7df | |||
| c188ecb080 | |||
| 865732342b | |||
| 4c07dd28c0 | |||
| 3bbff9e5ab | |||
| 6ebd02bdef | |||
| 523e30ea0c | |||
| f1c0fc3919 | |||
| 6e435de766 | |||
| 426ec4ec67 | |||
| 80e254834d | |||
| ba8ae1d84f | |||
| 84eaa68425 | |||
| 5ee14494e4 | |||
| 4ad521d8b5 | |||
| 9474e89ba4 | |||
| 20478c4d3a | |||
| 63e8b28a99 | |||
| cc63d03fbb | |||
| 2a60c9bd17 | |||
| c614cfee58 | |||
| 7341c77d69 | |||
| ef65dcfa6f | |||
| 6a9c583e73 | |||
| b37cdce2b1 | |||
| b30880a762 | |||
| 49eedea373 | |||
| 9fdf3de346 | |||
| c0c17d4896 | |||
| 097aa0ea22 | |||
| 482b0adf1b | |||
| 8c654c045f | |||
| 9101d832e6 | |||
| 93348d9458 | |||
| abfc4f3387 | |||
| 6b78837b29 | |||
| 120157fd2a | |||
| 8e67598aa6 | |||
| ad50bf4b25 | |||
| cf6ff18246 | |||
| 14e3f9a1b2 | |||
| 3123f15138 | |||
| 413366e9a2 | |||
| 10585e035e | |||
| fb96c1e98c | |||
| 8fa7357f2d | |||
| a7af4538ca | |||
| 604f235937 | |||
| 14b8ae02e7 | |||
| 03d37f2441 | |||
| a7c871680e | |||
| 429284dc37 | |||
| 253a98078a | |||
| 21539e6856 | |||
| b522c4476f | |||
| 78b6c4845a | |||
| b983ba35bd | |||
| 54be8a0be2 | |||
| dfc77408bd | |||
| c17ca8ef18 | |||
| 06ec486794 | |||
| 8fe8386591 | |||
| a37415c31b | |||
| 81653d9688 | |||
| eeab52a4ff | |||
| c33afd89f5 | |||
| 7e9bd08f60 | |||
| ae0ccb4017 | |||
| 739c350c19 | |||
| ba8dc958a3 | |||
| e221910e77 | |||
| b167109ba1 | |||
| 602358f8a8 | |||
| 49a3c8662b | |||
| b0925b3878 | |||
| 654865e21d | |||
| c9415c19d3 | |||
| 4c922709b6 | |||
| 657061fdce | |||
| 2f8844ba08 | |||
| 4b59f00e91 | |||
| 9e8744a545 | |||
| e4a28e5316 | |||
| 0bba88df03 | |||
| 8437bae6ef | |||
| f48c6791b7 | |||
| c2c5e0909a | |||
| 1cb0cc2975 | |||
| 99c3cfb83c | |||
| 1ece1ae829 | |||
| c59e120c55 | |||
| d2339d6840 | |||
| b35cc93420 | |||
| 8cbba4622c | |||
| 385da2dae2 | |||
| 2daf23ab0c | |||
| cbf4c05b15 | |||
| d3c04b6a39 | |||
| 4cb3b924cd | |||
| a33ce60c66 | |||
| 24aecf421a | |||
| 2efce05dc3 | |||
| 8999ec3c16 | |||
| 05af6da8d9 | |||
| 9a4548bae7 | |||
| ff578cae54 | |||
| 22de45235c | |||
| 76e8a70476 | |||
| 9cbc7e5f3b | |||
| 27a7b070db | |||
| 901cf4c52b | |||
| d0fae88114 | |||
| 17c3103c56 | |||
| 996d095c54 | |||
| d65fac2738 | |||
| ce4f5a29fb | |||
| baee28c46c | |||
| 29e70e3e88 | |||
| 82091b864a | |||
| c0c2335ce0 | |||
| 90fbf12540 | |||
| 49d849b3ab | |||
| 27ca23dc00 | |||
| 54d3544784 | |||
| 703e42ee4b | |||
| 29a8d6a554 | |||
| 2c08ff23c0 | |||
| bfdcfa6a05 | |||
| 9289e577ec | |||
| a6d471c759 | |||
| 01a5d18a53 | |||
| 929b4f2973 | |||
| 3b7178cfa4 | |||
| e46fa5d52e | |||
| a8683102cc | |||
| 71bcaf99e2 | |||
| 8b430d7dea | |||
| e0ade06d63 | |||
| 4bd18ec0c7 | |||
| 2410e320b3 | |||
| 48a8f4a7fd | |||
| 4dd6416faf | |||
| c1c0d00b88 | |||
| d9f726c4d0 | |||
| d6e4a130b0 | |||
| cfc15a1031 | |||
| 70f3e8e3a1 | |||
| ef978fe411 | |||
| f7c1234990 | |||
| 57f044945f | |||
| 4caf7044e0 | |||
| 6f32cddf1c | |||
| c530e2cfe3 | |||
| fd5dcc5c81 | |||
| 93dc5a2870 | |||
| 95529e3253 | |||
| 344020c926 | |||
| 5574081c49 | |||
| d7f396486e | |||
| 8fbd84bf78 | |||
| 7d2dcce175 | |||
| dc903e70ac | |||
| a9c8212895 | |||
| c20ecb6a51 | |||
| 5253edaacb | |||
| 017d9f1515 | |||
| 181b27d881 | |||
| 63e2a6419d | |||
| 264017a2bf | |||
| e433c115bc | |||
| 86fd8bb0ac | |||
| ab3a5a8259 | |||
| a61f0521b8 | |||
| 537c9755a7 | |||
| 786b7f18a5 | |||
| 8f36444c4f | |||
| 185b2c29e2 | |||
| 5f08050d8d | |||
| 64da65b322 | |||
| 5255d99dc5 | |||
| 4f2ad11135 | |||
| d7afab6d3a | |||
| 31348dff03 | |||
| 25e86b6a61 | |||
| 4efbac6d35 | |||
| 87069ccf68 | |||
| 7e45107f51 | |||
| 0c48b37c31 | |||
| 7eacffd951 | |||
| 2a543d6efe | |||
| 317b29de0f | |||
| a463c333dd | |||
| ea356004d4 | |||
| 5c976a7e1a | |||
| f964493274 | |||
| a4211a4dc3 | |||
| 563836496a | |||
| 4ca2c358b1 | |||
| 0580aab02f | |||
| 3711811b1d | |||
| 65b89d16ee | |||
| 931746bc6d | |||
| c81dddb45c | |||
| fe6d09ae61 | |||
| ed70c70ea3 | |||
| f0d4e14557 | |||
| 2ccee3def6 | |||
| b92adec8e8 | |||
| 56f738ae9b | |||
| 72d3a30c63 | |||
| c9b45adeeb | |||
| 5a6c81b051 | |||
| 51cd22ce56 | |||
| 5ed704ec8c | |||
| 4abf6336ec | |||
| 0e163fce18 | |||
| 96b6f475dd | |||
| c410f5d020 | |||
| bb8c697ee0 | |||
| b9e96b17de | |||
| 923797fea4 | |||
| cd9e60c76c | |||
| 93b38bea5d | |||
| d0d93b92b1 | |||
| 89efcf1ce5 | |||
| c664b0e683 | |||
| d69ff0cbbb | |||
| 1af090b57d | |||
| 3dad944485 | |||
| 105a40f53a | |||
| bbe9bd9684 | |||
| 4f65af0e25 | |||
| d79ced3292 | |||
| ab40644669 | |||
| 5d60def02c | |||
| ea8489fce2 | |||
| 1b20639a43 | |||
| b72af8f1ed | |||
| 9090bf02e7 | |||
| 7d648418b8 | |||
| 89be30fa7d | |||
| f8ecb84c02 | |||
| 5f036d2bcc | |||
| 380170038e | |||
| 220a47627b | |||
| beb89f68b4 | |||
| 390b495ff3 | |||
| 3a0e1fc070 | |||
| 6b7de1a030 | |||
| 5265631d15 | |||
| 2832e7b9f9 | |||
| 3a7dd7e367 | |||
| 223c19224b | |||
| f1f6cc10c7 | |||
| 3209b49033 | |||
| 1e4277d2d1 | |||
| 9b945daaf1 | |||
| 9c1352eb57 | |||
| 7a0b011dd5 | |||
| 63e835cbcc | |||
| 94b5edeb53 | |||
| ab7e6006d6 | |||
| 18bfcdd05c | |||
| 71d63ed72e | |||
| d75c40734a | |||
| 5b23c3f26f | |||
| 00efdc84ba | |||
| 91a61da9b1 | |||
| ef9b636e2d | |||
| 2709c0009a | |||
| dd7e8f5f64 | |||
| d2a68364c4 | |||
| 7e1081139d | |||
| 18473cf498 | |||
| 4df417d059 | |||
| 5d80a9178b | |||
| 8a25d3a71a | |||
| d10f8e1d43 | |||
| 14cc317ba4 | |||
| e1957c6ebd | |||
| 8cd5a992bf | |||
| 947f0b23cc | |||
| f780504d12 | |||
| bfc072addf | |||
| 2a18da257c | |||
| 6e01e8c1c8 | |||
| 9f659bf07f | |||
| 35c4bc20d9 | |||
| 218dc2ccda | |||
| 827cbcd37c | |||
| cb7a1c1cbf | |||
| 7878958c0d | |||
| ce036244c9 | |||
| 48cf1e413c | |||
| 97460585d9 | |||
| f745847ef7 | |||
| 6549aef245 | |||
| 50376faa7b | |||
| 4b61c6b669 | |||
| 79d64c4954 | |||
| 74cd5abdd1 | |||
| 28c3f12104 | |||
| c884819135 | |||
| 05921a9a7a | |||
| d0215a58e7 | |||
| 937e7b7d7c | |||
| aee8ef661a | |||
| 2e0b6e7757 | |||
| 941767127c | |||
| 74d8d77626 | |||
| fd4ea8ef5c | |||
| 1066cbd152 | |||
| 6ef00b03a2 | |||
| 9140561059 | |||
| 77af974b40 | |||
| 4934d49274 | |||
| 358c328d69 | |||
| 4aaafdd289 | |||
| 66b108d142 | |||
| e0ff920001 | |||
| face83c7ec | |||
| 1db83e31a2 | |||
| a1b9cb2a34 | |||
| 3a4fd5ca59 | |||
| c17daa9f89 | |||
| bd29cf3d3a | |||
| 31bff69151 | |||
| ba4f826738 | |||
| de60a3fb93 | |||
| 21d5daa4ac | |||
| 290e015c6c | |||
| 1b7c791d60 | |||
| bbe4466fd9 | |||
| 08133c4d1a | |||
| 76a7983b23 | |||
| 8041b7305e | |||
| 3ec8c25cd0 | |||
| 671af2b1c0 | |||
| 6f41f0e377 | |||
| 2c9b638065 | |||
| a7347d9a6d | |||
| f8c688d746 | |||
| c9fadda543 | |||
| 30fb0956df | |||
| 3a765bd5e1 | |||
| 26c52a5ea6 | |||
| c3372e87be | |||
| b0a1d667b0 | |||
| e1d5402238 | |||
| 3d1cfbfc74 | |||
| 37ca558103 | |||
| eed74a558f | |||
| 2acd76f346 | |||
| b81a6a6bb3 | |||
| 0fbfc4b81b | |||
| c06170cc8e | |||
| 614856da25 | |||
| 05bdf4eaf3 | |||
| 6774bd50b0 | |||
| 31c1f3255e | |||
| 21d93c140d | |||
| f1c8520146 | |||
| 096827c284 | |||
| 6565d9e33e | |||
| f375ec8440 | |||
| 518369d78c | |||
| 30bad5c492 | |||
| 3fefe271ec | |||
| 6428f1d051 | |||
| 7e1b21daac | |||
| cb3f30c600 | |||
| f3e024bece | |||
| 31d2ab4aff | |||
| eb17212858 | |||
| 4dd4b5c538 | |||
| 6120e5aaea | |||
| 2eaa81b236 | |||
| 81ce2a4b26 | |||
| 5dd80d3777 | |||
| beeee69bc9 | |||
| 9bf28d0b69 | |||
| c0ce15dfb2 | |||
| b9bcdc7158 | |||
| 4ff0203987 | |||
| b5f882cc98 | |||
| 2e8fc0d4c3 | |||
| dacaf5a400 | |||
| 24cde76a15 | |||
| 1aa1361510 | |||
| fe470ae5ad | |||
| 3a8c2381f7 | |||
| c85b80c2b6 | |||
| 2b981012a6 | |||
| 6ccc0bfffb | |||
| c8e7eb1eb3 | |||
| 24f60a54f4 | |||
| 42c02f5892 | |||
| ebede26ebf | |||
| d940ce497e | |||
| 05ff90b692 | |||
| 1d9b737e05 | |||
| 60dc62dc9e | |||
| 0f90effc66 | |||
| 464dd985e3 | |||
| c07a442854 | |||
| cd3aa153a4 | |||
| 9b294976a2 | |||
| 5313c2cb8b | |||
| 5f09cbdb63 | |||
| 4cefa9b49b | |||
| f86bd6190a | |||
| e5452ddfd6 | |||
| d06980dfa7 | |||
| 66785cc05c | |||
| 05a38612b0 | |||
| d27f4bae39 | |||
| 8d8c2f6ffe | |||
| 51d3cb951d | |||
| e74b1736a1 | |||
| f07c1ceaa5 | |||
| 63b2206ad0 | |||
| 27feead2f8 | |||
| c782195662 | |||
| 0f621c2c7d | |||
| a9e4574261 | |||
| 0229c386c5 | |||
| a7b3e33078 | |||
| e19a64c7ef | |||
| 1cb4ad8de9 | |||
| 6ed068a71a | |||
| 708e6c18b0 | |||
| b943890484 | |||
| a1125ad4df | |||
| a8b150c595 | |||
| 665cbcec4b | |||
| 7c600440f7 | |||
| e0c6f556e8 | |||
| de23687d16 | |||
| 4cea74c73b | |||
| a921d8be9d | |||
| 094f716bf2 | |||
| 7d761fe3c1 | |||
| cf35d8f3d7 | |||
| 4bb6b67188 | |||
| 819b18e7ba | |||
| 19849db573 | |||
| 3d4ceb292c | |||
| f5a37c6c6c | |||
| 32c927b53f | |||
| 5ffc0d13a2 | |||
| 112627e8b2 | |||
| 37c1e3c218 | |||
| 06e9ebebd5 |
`.buildkite/check-wheel-size.py` (new file, 36 lines)

```python
import os
import zipfile

MAX_SIZE_MB = 200


def print_top_10_largest_files(zip_file):
    with zipfile.ZipFile(zip_file, 'r') as z:
        file_sizes = [(f, z.getinfo(f).file_size) for f in z.namelist()]
        file_sizes.sort(key=lambda x: x[1], reverse=True)
        for f, size in file_sizes[:10]:
            print(f"{f}: {size/(1024*1024)} MBs uncompressed.")


def check_wheel_size(directory):
    for root, _, files in os.walk(directory):
        for f in files:
            if f.endswith(".whl"):
                wheel_path = os.path.join(root, f)
                wheel_size = os.path.getsize(wheel_path)
                wheel_size_mb = wheel_size / (1024 * 1024)
                if wheel_size_mb > MAX_SIZE_MB:
                    print(
                        f"Wheel {wheel_path} is too large ({wheel_size_mb} MB) "
                        f"compare to the allowed size ({MAX_SIZE_MB} MB).")
                    print_top_10_largest_files(wheel_path)
                    return 1
                else:
                    print(f"Wheel {wheel_path} is within the allowed size "
                          f"({wheel_size_mb} MB).")
    return 0


if __name__ == "__main__":
    import sys
    sys.exit(check_wheel_size(sys.argv[1]))
```
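As a point of reference, the check can be exercised locally against a directory of built wheels. The sketch below is a hypothetical invocation; the `dist/` path is an assumption about where wheels might live, not something the script prescribes.

```python
# Hypothetical local run of the wheel-size check.
# "dist/" is an assumed location for built wheels; pass whatever directory
# actually contains the .whl files.
import subprocess

ret = subprocess.run(
    ["python3", ".buildkite/check-wheel-size.py", "dist/"]
).returncode
print("wheel size check exit code:", ret)  # non-zero means a wheel exceeded MAX_SIZE_MB
```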
`.buildkite/download-images.sh` (new file, 18 lines)

```bash
#!/bin/bash

set -ex
set -o pipefail

(which wget && which curl) || (apt-get update && apt-get install -y wget curl)

# aws s3 sync s3://air-example-data-2/vllm_opensource_llava/ images/
mkdir -p images
cd images
wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign_pixel_values.pt
wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign_image_features.pt
wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom_pixel_values.pt
wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom_image_features.pt
wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign.jpg
wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom.jpg

cd -
```
`.buildkite/nightly-benchmarks/kickoff-pipeline.sh` (new executable file, 26 lines)

```bash
#!/usr/bin/env bash

set -euo pipefail

# Install system packages
apt update
apt install -y curl jq

# Install minijinja for templating
curl -sSfL https://github.com/mitsuhiko/minijinja/releases/latest/download/minijinja-cli-installer.sh | sh
source $HOME/.cargo/env

# If BUILDKITE_PULL_REQUEST != "false", then we check the PR labels using curl and jq
if [ "$BUILDKITE_PULL_REQUEST" != "false" ]; then
  PR_LABELS=$(curl -s "https://api.github.com/repos/vllm-project/vllm/pulls/$BUILDKITE_PULL_REQUEST" | jq -r '.labels[].name')

  if [[ $PR_LABELS == *"perf-benchmarks"* ]]; then
    echo "This PR has the 'perf-benchmarks' label. Proceeding with the nightly benchmarks."
  else
    echo "This PR does not have the 'perf-benchmarks' label. Skipping the nightly benchmarks."
    exit 0
  fi
fi

# Upload sample.yaml
buildkite-agent pipeline upload .buildkite/nightly-benchmarks/sample.yaml
```
`.buildkite/nightly-benchmarks/sample.yaml` (new file, 39 lines)

```yaml
steps:
  # NOTE(simon): You can create separate blocks for different jobs
  - label: "A100: NVIDIA SMI"
    agents:
      queue: A100
    plugins:
      - kubernetes:
          podSpec:
            containers:
              # - image: us-central1-docker.pkg.dev/vllm-405802/vllm-ci-test-repo/vllm-test:$BUILDKITE_COMMIT
              # TODO(simon): check latest main branch or use the PR image.
              - image: us-central1-docker.pkg.dev/vllm-405802/vllm-ci-test-repo/vllm-test:45c35f0d58f4508bf43bd6af1d3d0d0ec0c915e6
                command:
                  - bash -c 'nvidia-smi && nvidia-smi topo -m && pwd && ls'
                resources:
                  limits:
                    nvidia.com/gpu: 8
                volumeMounts:
                  - name: devshm
                    mountPath: /dev/shm
            nodeSelector:
              nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
            volumes:
              - name: devshm
                emptyDir:
                  medium: Memory
  # TODO(simon): bring H100 online
  # - label: "H100: NVIDIA SMI"
  #   agents:
  #     queue: H100
  #   plugins:
  #     - docker#v5.11.0:
  #         image: us-central1-docker.pkg.dev/vllm-405802/vllm-ci-test-repo/vllm-test:45c35f0d58f4508bf43bd6af1d3d0d0ec0c915e6
  #       command:
  #         - bash -c 'nvidia-smi && nvidia-smi topo -m'
  #       propagate-environment: true
  #       ipc: host
  #       gpus: all
```
`.buildkite/run-amd-test.sh` (new file, 73 lines)

```bash
# This script runs test inside the corresponding ROCm docker container.
set -ex

# Print ROCm version
echo "--- ROCm info"
rocminfo

# cleanup older docker images
cleanup_docker() {
  # Get Docker's root directory
  docker_root=$(docker info -f '{{.DockerRootDir}}')
  if [ -z "$docker_root" ]; then
    echo "Failed to determine Docker root directory."
    exit 1
  fi
  echo "Docker root directory: $docker_root"
  # Check disk usage of the filesystem where Docker's root directory is located
  disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
  # Define the threshold
  threshold=70
  if [ "$disk_usage" -gt "$threshold" ]; then
    echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
    # Remove dangling images (those that are not tagged and not used by any container)
    docker image prune -f
    # Remove unused volumes
    docker volume prune -f
    echo "Docker images and volumes cleanup completed."
  else
    echo "Disk usage is below $threshold%. No cleanup needed."
  fi
}

# Call the cleanup docker function
cleanup_docker

echo "--- Resetting GPUs"

echo "reset" > /opt/amdgpu/etc/gpu_state

while true; do
  sleep 3
  if grep -q clean /opt/amdgpu/etc/gpu_state; then
    echo "GPUs state is \"clean\""
    break
  fi
done

echo "--- Building container"
sha=$(git rev-parse --short HEAD)
image_name=rocm_${sha}
container_name=rocm_${sha}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)
docker build \
  -t ${image_name} \
  -f Dockerfile.rocm \
  --progress plain \
  .

remove_docker_container() {
  docker rm -f ${container_name} || docker image rm -f ${image_name} || true
}
trap remove_docker_container EXIT

echo "--- Running container"

docker run \
  --device /dev/kfd --device /dev/dri \
  --network host \
  --rm \
  -e HF_TOKEN \
  --name ${container_name} \
  ${image_name} \
  /bin/bash -c "${@}"
```
`.buildkite/run-benchmarks.sh` (new file, 78 lines)

````bash
# This script is run by buildkite to run the benchmarks and upload the results to buildkite

set -ex
set -o pipefail

# cd into parent directory of this file
cd "$(dirname "${BASH_SOURCE[0]}")/.."

(which wget && which curl) || (apt-get update && apt-get install -y wget curl)

# run python-based benchmarks and upload the result to buildkite
python3 benchmarks/benchmark_latency.py --output-json latency_results.json 2>&1 | tee benchmark_latency.txt
bench_latency_exit_code=$?

python3 benchmarks/benchmark_throughput.py --input-len 256 --output-len 256 --output-json throughput_results.json 2>&1 | tee benchmark_throughput.txt
bench_throughput_exit_code=$?

# run server-based benchmarks and upload the result to buildkite
python3 -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-2-7b-chat-hf &
server_pid=$!
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json

# wait for server to start, timeout after 600 seconds
timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
python3 benchmarks/benchmark_serving.py \
  --backend vllm \
  --dataset-name sharegpt \
  --dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json \
  --model meta-llama/Llama-2-7b-chat-hf \
  --num-prompts 20 \
  --endpoint /v1/completions \
  --tokenizer meta-llama/Llama-2-7b-chat-hf \
  --save-result \
  2>&1 | tee benchmark_serving.txt
bench_serving_exit_code=$?
kill $server_pid

# write the results into a markdown file
echo "### Latency Benchmarks" >> benchmark_results.md
sed -n '1p' benchmark_latency.txt >> benchmark_results.md # first line
echo "" >> benchmark_results.md
sed -n '$p' benchmark_latency.txt >> benchmark_results.md # last line

echo "### Throughput Benchmarks" >> benchmark_results.md
sed -n '1p' benchmark_throughput.txt >> benchmark_results.md # first line
echo "" >> benchmark_results.md
sed -n '$p' benchmark_throughput.txt >> benchmark_results.md # last line

echo "### Serving Benchmarks" >> benchmark_results.md
sed -n '1p' benchmark_serving.txt >> benchmark_results.md # first line
echo "" >> benchmark_results.md
echo '```' >> benchmark_results.md
tail -n 24 benchmark_serving.txt >> benchmark_results.md # last 24 lines
echo '```' >> benchmark_results.md

# if the agent binary is not found, skip uploading the results, exit 0
if [ ! -f /usr/bin/buildkite-agent ]; then
  exit 0
fi

# upload the results to buildkite
buildkite-agent annotate --style "info" --context "benchmark-results" < benchmark_results.md

# exit with the exit code of the benchmarks
if [ $bench_latency_exit_code -ne 0 ]; then
  exit $bench_latency_exit_code
fi

if [ $bench_throughput_exit_code -ne 0 ]; then
  exit $bench_throughput_exit_code
fi

if [ $bench_serving_exit_code -ne 0 ]; then
  exit $bench_serving_exit_code
fi

rm ShareGPT_V3_unfiltered_cleaned_split.json
buildkite-agent artifact upload "*.json"
````
`.buildkite/run-cpu-test.sh` (new file, 24 lines)

```bash
# This script build the CPU docker image and run the offline inference inside the container.
# It serves a sanity check for compilation and basic model usage.
set -ex

# Try building the docker image
docker build -t cpu-test -f Dockerfile.cpu .

# Setup cleanup
remove_docker_container() { docker rm -f cpu-test || true; }
trap remove_docker_container EXIT
remove_docker_container

# Run the image
docker run -itd -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 --cpuset-mems=1 --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --name cpu-test cpu-test

# offline inference
docker exec cpu-test bash -c "python3 examples/offline_inference.py"

# Run basic model test
docker exec cpu-test bash -c "cd tests;
  pip install pytest Pillow protobuf
  bash ../.buildkite/download-images.sh
  cd ../
  pytest -v -s tests/models --ignore=tests/models/test_llava.py --ignore=tests/models/test_embedding.py --ignore=tests/models/test_registry.py"
```
`.buildkite/run-neuron-test.sh` (new file, 51 lines)

```bash
# This script build the Neuron docker image and run the API server inside the container.
# It serves a sanity check for compilation and basic model usage.
set -e

# Try building the docker image
aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-west-2.amazonaws.com

# prune old image and containers to save disk space, and only once a day
# by using a timestamp file in tmp.
if [ -f /tmp/neuron-docker-build-timestamp ]; then
  last_build=$(cat /tmp/neuron-docker-build-timestamp)
  current_time=$(date +%s)
  if [ $((current_time - last_build)) -gt 86400 ]; then
    docker system prune -f
    echo $current_time > /tmp/neuron-docker-build-timestamp
  fi
else
  echo $(date +%s) > /tmp/neuron-docker-build-timestamp
fi

docker build -t neuron -f Dockerfile.neuron .

# Setup cleanup
remove_docker_container() { docker rm -f neuron || true; }
trap remove_docker_container EXIT
remove_docker_container

# Run the image
docker run --device=/dev/neuron0 --device=/dev/neuron1 --network host --name neuron neuron python3 -m vllm.entrypoints.api_server \
  --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --max-num-seqs 8 --max-model-len 128 --block-size 128 --device neuron --tensor-parallel-size 2 &

# Wait for the server to start
wait_for_server_to_start() {
  timeout=300
  counter=0

  while [ "$(curl -s -o /dev/null -w ''%{http_code}'' localhost:8000/health)" != "200" ]; do
    sleep 1
    counter=$((counter + 1))
    if [ $counter -ge $timeout ]; then
      echo "Timeout after $timeout seconds"
      break
    fi
  done
}
wait_for_server_to_start

# Test a simple prompt
curl -X POST -H "Content-Type: application/json" \
  localhost:8000/generate \
  -d '{"prompt": "San Francisco is a"}'
```
`.buildkite/test-pipeline.yaml` (new file, 169 lines)

```yaml
# In this file, you can add more tests to run either by adding a new step or
# adding a new command to an existing step. See different options here for examples.
# This script will be feed into Jinja template in `test-template.j2` to generate
# the final pipeline yaml file.

steps:
- label: Regression Test
  mirror_hardwares: [amd]
  command: pytest -v -s test_regression.py
  working_dir: "/vllm-workspace/tests" # optional

- label: AsyncEngine Test
  #mirror_hardwares: [amd]
  command: pytest -v -s async_engine

- label: Basic Correctness Test
  mirror_hardwares: [amd]
  commands:
  - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_basic_correctness.py
  - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_basic_correctness.py
  - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
  - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
  - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py

- label: Core Test
  mirror_hardwares: [amd]
  command: pytest -v -s core

- label: Distributed Comm Ops Test
  #mirror_hardwares: [amd]
  command: pytest -v -s distributed/test_comm_ops.py
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2

- label: Distributed Tests
  mirror_hardwares: [amd]
  working_dir: "/vllm-workspace/tests"
  num_gpus: 2
  commands:
  - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py
  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
  - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py
  - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py
  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
  - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
  - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
  - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
  - pytest -v -s spec_decode/e2e/test_integration_dist.py
  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py

- label: Distributed Tests (Multiple Groups)
  #mirror_hardwares: [amd]
  working_dir: "/vllm-workspace/tests"
  num_gpus: 4
  commands:
  - pytest -v -s distributed/test_pynccl.py

- label: Engine Test
  mirror_hardwares: [amd]
  command: pytest -v -s engine tokenization test_sequence.py test_config.py test_logger.py

- label: Entrypoints Test
  mirror_hardwares: [amd]

  commands:
  - pytest -v -s entrypoints -m llm
  - pytest -v -s entrypoints -m openai

- label: Examples Test
  working_dir: "/vllm-workspace/examples"
  mirror_hardwares: [amd]
  commands:
  # install aws cli for llava_example.py
  # install tensorizer for tensorize_vllm_model.py
  - pip install awscli tensorizer
  - python3 offline_inference.py
  - python3 offline_inference_with_prefix.py
  - python3 llm_engine_example.py
  - python3 llava_example.py
  - python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors

- label: Inputs Test
  #mirror_hardwares: [amd]
  commands:
  - bash ../.buildkite/download-images.sh
  - pytest -v -s test_inputs.py
  - pytest -v -s multimodal

- label: Kernels Test %N
  #mirror_hardwares: [amd]
  command: pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
  parallelism: 4

- label: Models Test
  #mirror_hardwares: [amd]
  commands:
  - pytest -v -s models -m \"not llava\"

- label: Llava Test
  mirror_hardwares: [amd]
  commands:
  - bash ../.buildkite/download-images.sh
  - pytest -v -s models -m llava

- label: Prefix Caching Test
  mirror_hardwares: [amd]
  commands:
  - pytest -v -s prefix_caching

- label: Samplers Test
  #mirror_hardwares: [amd]
  command: pytest -v -s samplers

- label: LogitsProcessor Test
  mirror_hardwares: [amd]
  command: pytest -v -s test_logits_processor.py

- label: Utils Test
  command: pytest -v -s test_utils.py

- label: Worker Test
  mirror_hardwares: [amd]
  command: pytest -v -s worker

- label: Speculative decoding tests
  #mirror_hardwares: [amd]
  commands:
  # See https://github.com/vllm-project/vllm/issues/5152
  - export VLLM_ATTENTION_BACKEND=XFORMERS
  - pytest -v -s spec_decode

- label: LoRA Test %N
  #mirror_hardwares: [amd]
  command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py
  parallelism: 4

- label: LoRA Long Context (Distributed)
  #mirror_hardwares: [amd]
  num_gpus: 4
  # This test runs llama 13B, so it is required to run on 4 GPUs.
  commands:
  - pytest -v -s -x lora/test_long_context.py

- label: Tensorizer Test
  #mirror_hardwares: [amd]
  command: apt-get install curl libsodium23 && pytest -v -s tensorizer_loader

- label: Metrics Test
  mirror_hardwares: [amd]
  command: pytest -v -s metrics

- label: Quantization Test
  #mirror_hardwares: [amd]
  command: pytest -v -s quantization

- label: Benchmarks
  working_dir: "/vllm-workspace/.buildkite"
  mirror_hardwares: [amd]
  commands:
  - pip install aiohttp
  - bash run-benchmarks.sh

- label: Documentation Build
  working_dir: "/vllm-workspace/test_docs/docs"
  no_gpu: True
  commands:
  - pip install -r requirements-docs.txt
  - SPHINXOPTS=\"-W\" make html
```
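Since this file is only the input to the Jinja templates, it can be useful to see what each step would execute before the pipeline is generated. The following is a minimal local sketch (not part of the CI); it assumes the `pyyaml` package is installed, and the default working directory mirrors the value set in the templates.

```python
# Minimal sketch: list what each pipeline step would run.
# Assumes pyyaml is installed; DEFAULT_WORKING_DIR is taken from the test
# templates and is an assumption about how the CI resolves it.
import yaml

DEFAULT_WORKING_DIR = "/vllm-workspace/tests"

with open(".buildkite/test-pipeline.yaml") as f:
    steps = yaml.safe_load(f)["steps"]

for step in steps:
    commands = step.get("commands") or [step["command"]]
    workdir = step.get("working_dir", DEFAULT_WORKING_DIR)
    print(f"{step['label']}: cd {workdir} && " + " && ".join(commands))
```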
`.buildkite/test-template-aws.j2` (new file, 64 lines)

```jinja
{% set docker_image = "public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT" %}
{% set default_working_dir = "/vllm-workspace/tests" %}

steps:
  - label: ":docker: build image"
    agents:
      queue: cpu_queue
    commands:
      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
      - "docker build --build-arg max_jobs=16 --tag {{ docker_image }} --target test --progress plain ."
      - "docker push {{ docker_image }}"
    env:
      DOCKER_BUILDKIT: "1"
    retry:
      automatic:
        - exit_status: -1  # Agent was lost
          limit: 5
        - exit_status: -10  # Agent was lost
          limit: 5
  - wait

  {% for step in steps %}
  - label: "{{ step.label }}"
    agents:
      {% if step.label == "Documentation Build" %}
      queue: small_cpu_queue
      {% elif step.no_gpu %}
      queue: cpu_queue
      {% elif step.num_gpus == 2 or step.num_gpus == 4 %}
      queue: gpu_4_queue
      {% else %}
      queue: gpu_1_queue
      {% endif %}
    soft_fail: true
    {% if step.parallelism %}
    parallelism: {{ step.parallelism }}
    {% endif %}
    retry:
      automatic:
        - exit_status: -1  # Agent was lost
          limit: 5
        - exit_status: -10  # Agent was lost
          limit: 5
    plugins:
      - docker#v5.2.0:
          image: {{ docker_image }}
          always-pull: true
          propagate-environment: true
          {% if not step.no_gpu %}
          gpus: all
          {% endif %}
          {% if step.label == "Benchmarks" %}
          mount-buildkite-agent: true
          {% endif %}
          command: ["bash", "-c", "cd {{ (step.working_dir or default_working_dir) | safe }} && {{ step.command or (step.commands | join(' && ')) | safe }}"]
          environment:
            - VLLM_USAGE_SOURCE=ci-test
            - HF_TOKEN
            {% if step.label == "Speculative decoding tests" %}
            - VLLM_ATTENTION_BACKEND=XFORMERS
            {% endif %}
          volumes:
            - /dev/shm:/dev/shm
  {% endfor %}
```
`.buildkite/test-template.j2` (new file, 96 lines)

```jinja
{% set docker_image = "us-central1-docker.pkg.dev/vllm-405802/vllm-ci-test-repo/vllm-test:$BUILDKITE_COMMIT" %}
{% set default_num_gpu = 1 %}
{% set default_working_dir = "/vllm-workspace/tests" %}

steps:
  - label: ":docker: build image"
    commands:
      - "docker build --build-arg max_jobs=16 --tag {{ docker_image }} --target test --progress plain ."
      - "docker push {{ docker_image }}"
    env:
      DOCKER_BUILDKIT: "1"
    retry:
      automatic:
        - exit_status: -1  # Agent was lost
          limit: 5
        - exit_status: -10  # Agent was lost
          limit: 5
  - wait

  - group: "AMD Tests"
    depends_on: ~
    steps:
    {% for step in steps %}
    {% if step.mirror_hardwares and "amd" in step.mirror_hardwares %}
      - label: "AMD: {{ step.label }}"
        agents:
          queue: amd
        command: bash .buildkite/run-amd-test.sh "cd {{ (step.working_dir or default_working_dir) | safe }} ; {{ step.command or (step.commands | join(" ; ")) | safe }}"
        env:
          DOCKER_BUILDKIT: "1"
        soft_fail: true
    {% endif %}
    {% endfor %}

  - label: "Neuron Test"
    depends_on: ~
    agents:
      queue: neuron
    command: bash .buildkite/run-neuron-test.sh
    soft_fail: false

  - label: "Intel Test"
    depends_on: ~
    agents:
      queue: intel
    command: bash .buildkite/run-cpu-test.sh

  {% for step in steps %}
  - label: "{{ step.label }}"
    agents:
      queue: kubernetes
    soft_fail: {{ step.soft_fail or false }}
    {% if step.parallelism %}
    parallelism: {{ step.parallelism }}
    {% endif %}
    retry:
      automatic:
        - exit_status: -1  # Agent was lost
          limit: 5
        - exit_status: -10  # Agent was lost
          limit: 5
    plugins:
      - kubernetes:
          podSpec:
            {% if step.num_gpus %}
            priorityClassName: gpu-priority-cls-{{ step.num_gpus }}
            {% endif %}
            volumes:
              - name: dshm
                emptyDir:
                  medium: Memory
            containers:
              - image: "{{ docker_image }}"
                command: ["bash"]
                args:
                  - '-c'
                  - "'cd {{ (step.working_dir or default_working_dir) | safe }} && {{ step.command or (step.commands | join(' && ')) | safe }}'"
                {% if not step.no_gpu %}
                resources:
                  requests:
                    nvidia.com/gpu: "{{ step.num_gpus or default_num_gpu }}"
                  limits:
                    nvidia.com/gpu: "{{ step.num_gpus or default_num_gpu }}"
                {% endif %}
                env:
                  - name: VLLM_USAGE_SOURCE
                    value: ci-test
                  - name: HF_TOKEN
                    valueFrom:
                      secretKeyRef:
                        name: hf-token-secret
                        key: token
                volumeMounts:
                  - mountPath: /dev/shm
                    name: dshm
  {% endfor %}
```
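To inspect the pipeline that this template expands to, it can be rendered locally. The sketch below assumes the `jinja2` and `pyyaml` packages are available; the CI itself may perform this expansion with a different toolchain.

```python
# Local-rendering sketch: expand test-template.j2 with the steps defined in
# test-pipeline.yaml and print the resulting Buildkite pipeline YAML.
# jinja2 and pyyaml are assumed to be installed.
import yaml
from jinja2 import Template

with open(".buildkite/test-pipeline.yaml") as f:
    steps = yaml.safe_load(f)["steps"]

with open(".buildkite/test-template.j2") as f:
    rendered = Template(f.read()).render(steps=steps)

print(rendered)
```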
`.clang-format` (new file, 26 lines)

```yaml
BasedOnStyle: Google
UseTab: Never
IndentWidth: 2
ColumnLimit: 80

# Force pointers to the type for C++.
DerivePointerAlignment: false
PointerAlignment: Left

# Reordering #include statements can (and currently will) introduce errors
SortIncludes: false

# Style choices
AlignConsecutiveAssignments: false
AlignConsecutiveDeclarations: false
IndentPPDirectives: BeforeHash

IncludeCategories:
  - Regex: '^<'
    Priority: 4
  - Regex: '^"(llvm|llvm-c|clang|clang-c|mlir|mlir-c)/'
    Priority: 3
  - Regex: '^"(qoda|\.\.)/'
    Priority: 2
  - Regex: '.*'
    Priority: 1
```
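With this configuration in place, `clang-format` picks up the style automatically when invoked from inside the repository. A hypothetical helper for formatting the C++/CUDA sources is sketched below; the `csrc/` directory and the extension list are assumptions about the repository layout rather than anything this file specifies.

```python
# Hypothetical formatting helper: apply the repo's .clang-format to C++/CUDA
# sources. The csrc/ directory and extension list are assumptions.
import pathlib
import subprocess

exts = ("*.h", "*.hpp", "*.cc", "*.cpp", "*.cu", "*.cuh")
files = [str(p) for ext in exts for p in pathlib.Path("csrc").rglob(ext)]
if files:
    subprocess.run(["clang-format", "-i", *files], check=True)
```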
`.dockerignore` (new file, 1 line)

```
vllm/*.so
```
`.github/ISSUE_TEMPLATE/100-documentation.yml` (new file, 22 lines)

```yaml
name: 📚 Documentation
description: Report an issue related to https://docs.vllm.ai/
title: "[Doc]: "
labels: ["documentation"]

body:
- type: textarea
  attributes:
    label: 📚 The doc issue
    description: >
      A clear and concise description of what content in https://docs.vllm.ai/ is an issue.
  validations:
    required: true
- type: textarea
  attributes:
    label: Suggest a potential alternative/fix
    description: >
      Tell us how we could improve the documentation in this regard.
- type: markdown
  attributes:
    value: >
      Thanks for contributing 🎉!
```
`.github/ISSUE_TEMPLATE/200-installation.yml` (new file, 40 lines)

````yaml
name: 🛠️ Installation
description: Report an issue here when you hit errors during installation.
title: "[Installation]: "
labels: ["installation"]

body:
- type: markdown
  attributes:
    value: >
      #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
- type: textarea
  attributes:
    label: Your current environment
    description: |
      Please run the following and paste the output below.
      ```sh
      wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
      # For security purposes, please feel free to check the contents of collect_env.py before running it.
      python collect_env.py
      ```
      It is suggested to download and execute the latest script, as vllm might frequently update the diagnosis information needed for accurately and quickly responding to issues.
    value: |
      ```text
      The output of `python collect_env.py`
      ```
  validations:
    required: true
- type: textarea
  attributes:
    label: How you are installing vllm
    description: |
      Paste the full command you are trying to execute.
    value: |
      ```sh
      pip install -vvv vllm
      ```
- type: markdown
  attributes:
    value: >
      Thanks for contributing 🎉!
````
`.github/ISSUE_TEMPLATE/300-usage.yml` (new file, 38 lines)

````yaml
name: 💻 Usage
description: Raise an issue here if you don't know how to use vllm.
title: "[Usage]: "
labels: ["usage"]

body:
- type: markdown
  attributes:
    value: >
      #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
- type: textarea
  attributes:
    label: Your current environment
    description: |
      Please run the following and paste the output below.
      ```sh
      wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
      # For security purposes, please feel free to check the contents of collect_env.py before running it.
      python collect_env.py
      ```
      It is suggested to download and execute the latest script, as vllm might frequently update the diagnosis information needed for accurately and quickly responding to issues.
    value: |
      ```text
      The output of `python collect_env.py`
      ```
  validations:
    required: true
- type: textarea
  attributes:
    label: How would you like to use vllm
    description: |
      A detailed description of how you want to use vllm.
    value: |
      I want to run inference of a [specific model](put link here). I don't know how to integrate it with vllm.
- type: markdown
  attributes:
    value: >
      Thanks for contributing 🎉!
````
86  .github/ISSUE_TEMPLATE/400-bug report.yml  vendored  Normal file
@@ -0,0 +1,86 @@
name: 🐛 Bug report
description: Raise an issue here if you find a bug.
title: "[Bug]: "
labels: ["bug"]

body:
- type: markdown
  attributes:
    value: >
      #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
- type: textarea
  attributes:
    label: Your current environment
    description: |
      Please run the following and paste the output below.
      ```sh
      wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
      # For security purposes, please feel free to check the contents of collect_env.py before running it.
      python collect_env.py
      ```
      It is suggested to download and execute the latest script, as vllm might frequently update the diagnosis information needed for accurately and quickly responding to issues.
    value: |
      ```text
      The output of `python collect_env.py`
      ```
  validations:
    required: true
- type: textarea
  attributes:
    label: 🐛 Describe the bug
    description: |
      Please provide a clear and concise description of what the bug is.

      If relevant, add a minimal example so that we can reproduce the error by running the code. It is very important for the snippet to be as succinct (minimal) as possible, so please take time to trim down any irrelevant code to help us debug efficiently. We are going to copy-paste your code and we expect to get the same result as you did: avoid any external data, and include the relevant imports, etc. For example:

      ```python
      from vllm import LLM, SamplingParams

      prompts = [
          "Hello, my name is",
          "The president of the United States is",
          "The capital of France is",
          "The future of AI is",
      ]
      sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

      llm = LLM(model="facebook/opt-125m")

      outputs = llm.generate(prompts, sampling_params)

      # Print the outputs.
      for output in outputs:
          prompt = output.prompt
          generated_text = output.outputs[0].text
          print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
      ```

      If the code is too long (hopefully, it isn't), feel free to put it in a public gist and link it in the issue: https://gist.github.com.

      Please also paste or describe the results you observe instead of the expected results. If you observe an error, please paste the error message including the **full** traceback of the exception. It may be relevant to wrap error messages in ```` ```triple quotes blocks``` ````.

      Please set the environment variable `export VLLM_LOGGING_LEVEL=DEBUG` to turn on more logging to help debugging potential issues.

      If you experienced crashes or hangs, it would be helpful to run vllm with `export VLLM_TRACE_FUNCTION=1`. All the function calls in vllm will be recorded. Inspect these log files, and tell us which function crashes or hangs.
    placeholder: |
      A clear and concise description of what the bug is.

      ```python
      # Sample code to reproduce the problem
      ```

      ```
      The error message you got, with the full traceback.
      ```
  validations:
    required: true
- type: markdown
  attributes:
    value: >
      ⚠️ Please separate bugs of `transformers` implementation or usage from bugs of `vllm`. If you think anything is wrong with the models' output:

      - Try the counterpart of `transformers` first. If the error appears, please go to [their issues](https://github.com/huggingface/transformers/issues?q=is%3Aissue+is%3Aopen+sort%3Aupdated-desc).

      - If the error only appears in vllm, please provide the detailed script of how you run `transformers` and `vllm`, also highlight the difference and what you expect.

      Thanks for contributing 🎉!
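For reporters, the sequence this template asks for can be run in one pass. The following is a minimal sketch (not part of the diff) that only uses the commands and environment variables named above; `repro.py` is a hypothetical placeholder for your own minimal reproduction script.

```bash
# Collect environment info for the issue body.
wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
python collect_env.py > env.txt

# Re-run the failing snippet with extra logging enabled and capture the full traceback.
export VLLM_LOGGING_LEVEL=DEBUG
export VLLM_TRACE_FUNCTION=1          # only for crashes/hangs; produces large trace logs
python repro.py 2>&1 | tee repro.log  # repro.py: your minimal reproduction (hypothetical name)
```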
31  .github/ISSUE_TEMPLATE/500-feature request.yml  vendored  Normal file
@@ -0,0 +1,31 @@
name: 🚀 Feature request
description: Submit a proposal/request for a new vllm feature
title: "[Feature]: "
labels: ["feature request"]

body:
- type: markdown
  attributes:
    value: >
      #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
- type: textarea
  attributes:
    label: 🚀 The feature, motivation and pitch
    description: >
      A clear and concise description of the feature proposal. Please outline the motivation for the proposal. Is your feature request related to a specific problem? e.g., *"I'm working on X and would like Y to be possible"*. If this is related to another GitHub issue, please link here too.
  validations:
    required: true
- type: textarea
  attributes:
    label: Alternatives
    description: >
      A description of any alternative solutions or features you've considered, if any.
- type: textarea
  attributes:
    label: Additional context
    description: >
      Add any other context or screenshots about the feature request.
- type: markdown
  attributes:
    value: >
      Thanks for contributing 🎉!
33  .github/ISSUE_TEMPLATE/600-new model.yml  vendored  Normal file
@@ -0,0 +1,33 @@
name: 🤗 Support request for a new model from huggingface
description: Submit a proposal/request for a new model from huggingface
title: "[New Model]: "
labels: ["new model"]

body:
- type: markdown
  attributes:
    value: >
      #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).

      #### We also highly recommend you read https://docs.vllm.ai/en/latest/models/adding_model.html first to understand how to add a new model.
- type: textarea
  attributes:
    label: The model to consider.
    description: >
      A huggingface url, pointing to the model, e.g. https://huggingface.co/openai-community/gpt2 .
  validations:
    required: true
- type: textarea
  attributes:
    label: The closest model vllm already supports.
    description: >
      Here is the list of models already supported by vllm: https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/models . Which model is the most similar to the model you want to add support for?
- type: textarea
  attributes:
    label: What's your difficulty of supporting the model you want?
    description: >
      For example, any new operators or new architecture?
- type: markdown
  attributes:
    value: >
      Thanks for contributing 🎉!
52  .github/ISSUE_TEMPLATE/700-performance discussion.yml  vendored  Normal file
@@ -0,0 +1,52 @@
name: ⚡ Discussion on the performance of vllm
description: Submit a proposal/discussion about the performance of vllm
title: "[Performance]: "
labels: ["performance"]

body:
- type: markdown
  attributes:
    value: >
      #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
- type: textarea
  attributes:
    label: Proposal to improve performance
    description: >
      How do you plan to improve vllm's performance?
  validations:
    required: false
- type: textarea
  attributes:
    label: Report of performance regression
    description: >
      Please provide detailed description of performance comparison to confirm the regression. You may want to run the benchmark script at https://github.com/vllm-project/vllm/tree/main/benchmarks .
  validations:
    required: false
- type: textarea
  attributes:
    label: Misc discussion on performance
    description: >
      Anything about the performance.
  validations:
    required: false
- type: textarea
  attributes:
    label: Your current environment (if you think it is necessary)
    description: |
      Please run the following and paste the output below.
      ```sh
      wget https://raw.githubusercontent.com/vllm-project/vllm/main/collect_env.py
      # For security purposes, please feel free to check the contents of collect_env.py before running it.
      python collect_env.py
      ```
      It is suggested to download and execute the latest script, as vllm might frequently update the diagnosis information needed for accurately and quickly responding to issues.
    value: |
      ```text
      The output of `python collect_env.py`
      ```
  validations:
    required: false
- type: markdown
  attributes:
    value: >
      Thanks for contributing 🎉!
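When reporting a regression, running the same benchmark command under the old and new versions makes the comparison concrete. A rough sketch only: the script name `benchmarks/benchmark_latency.py`, the model, and the version placeholders are assumptions to adjust to your setup; the template itself only points at the `benchmarks/` directory.

```bash
# Hypothetical A/B comparison of two vllm versions with the same benchmark command.
pip install vllm==<old-version>
python benchmarks/benchmark_latency.py --model facebook/opt-125m | tee old.txt

pip install vllm==<new-version>
python benchmarks/benchmark_latency.py --model facebook/opt-125m | tee new.txt

diff old.txt new.txt   # paste both outputs (and the diff) into the issue
```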
49  .github/ISSUE_TEMPLATE/750-RFC.yml  vendored  Normal file
@@ -0,0 +1,49 @@
name: 💬 Request for comments (RFC).
description: Ask for feedback on major architectural changes or design choices.
title: "[RFC]: "
labels: ["RFC"]

body:
- type: markdown
  attributes:
    value: >
      #### Please take a look at previous [RFCs](https://github.com/vllm-project/vllm/issues?q=label%3ARFC+sort%3Aupdated-desc) for reference.
- type: textarea
  attributes:
    label: Motivation.
    description: >
      The motivation of the RFC.
  validations:
    required: true
- type: textarea
  attributes:
    label: Proposed Change.
    description: >
      The proposed change of the RFC.
  validations:
    required: true
- type: textarea
  attributes:
    label: Feedback Period.
    description: >
      The feedback period of the RFC. Usually at least one week.
  validations:
    required: false
- type: textarea
  attributes:
    label: CC List.
    description: >
      The list of people you want to CC.
  validations:
    required: false
- type: textarea
  attributes:
    label: Any Other Things.
    description: >
      Any other things you would like to mention.
  validations:
    required: false
- type: markdown
  attributes:
    value: >
      Thanks for contributing 🎉!
21  .github/ISSUE_TEMPLATE/800-misc discussion.yml  vendored  Normal file
@@ -0,0 +1,21 @@
name: 🎲 Misc/random discussions that do not fit into the above categories.
description: Submit a discussion as you like. Note that developers are heavily overloaded and we mainly rely on community users to answer these issues.
title: "[Misc]: "
labels: ["misc"]

body:
- type: markdown
  attributes:
    value: >
      #### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
- type: textarea
  attributes:
    label: Anything you want to discuss about vllm.
    description: >
      Anything you want to discuss about vllm.
  validations:
    required: true
- type: markdown
  attributes:
    value: >
      Thanks for contributing 🎉!
1  .github/ISSUE_TEMPLATE/config.yml  vendored  Normal file
@@ -0,0 +1 @@
blank_issues_enabled: false
64  .github/PULL_REQUEST_TEMPLATE.md  vendored  Normal file
@@ -0,0 +1,64 @@
FILL IN THE PR DESCRIPTION HERE

FIX #xxxx (*link existing issues this PR will resolve*)

**BEFORE SUBMITTING, PLEASE READ THE CHECKLIST BELOW AND FILL IN THE DESCRIPTION ABOVE**

---

<details>
<!-- inside this <details> section, markdown rendering does not work, so we use raw html here. -->
<summary><b> PR Checklist (Click to Expand) </b></summary>

<p>Thank you for your contribution to vLLM! Before submitting the pull request, please ensure the PR meets the following criteria. This helps vLLM maintain the code quality and improve the efficiency of the review process.</p>

<h3>PR Title and Classification</h3>
<p>Only specific types of PRs will be reviewed. The PR title is prefixed appropriately to indicate the type of change. Please use one of the following:</p>
<ul>
    <li><code>[Bugfix]</code> for bug fixes.</li>
    <li><code>[CI/Build]</code> for build or continuous integration improvements.</li>
    <li><code>[Doc]</code> for documentation fixes and improvements.</li>
    <li><code>[Model]</code> for adding a new model or improving an existing model. Model name should appear in the title.</li>
    <li><code>[Frontend]</code> for changes on the vLLM frontend (e.g., OpenAI API server, <code>LLM</code> class, etc.)</li>
    <li><code>[Kernel]</code> for changes affecting CUDA kernels or other compute kernels.</li>
    <li><code>[Core]</code> for changes in the core vLLM logic (e.g., <code>LLMEngine</code>, <code>AsyncLLMEngine</code>, <code>Scheduler</code>, etc.)</li>
    <li><code>[Hardware][Vendor]</code> for hardware-specific changes. Vendor name should appear in the prefix (e.g., <code>[Hardware][AMD]</code>).</li>
    <li><code>[Misc]</code> for PRs that do not fit the above categories. Please use this sparingly.</li>
</ul>
<p><strong>Note:</strong> If the PR spans more than one category, please include all relevant prefixes.</p>

<h3>Code Quality</h3>

<p>The PR needs to meet the following code quality standards:</p>

<ul>
    <li>We adhere to the <a href="https://google.github.io/styleguide/pyguide.html">Google Python style guide</a> and <a href="https://google.github.io/styleguide/cppguide.html">Google C++ style guide</a>.</li>
    <li>Pass all linter checks. Please use <a href="https://github.com/vllm-project/vllm/blob/main/format.sh"><code>format.sh</code></a> to format your code.</li>
    <li>The code needs to be well-documented to ensure future contributors can easily understand the code.</li>
    <li>Include sufficient tests to ensure the project stays correct and robust. This includes both unit tests and integration tests.</li>
    <li>Please add documentation to <code>docs/source/</code> if the PR modifies the user-facing behaviors of vLLM. It helps vLLM users understand and utilize the new features or changes.</li>
</ul>

<h3>Notes for Large Changes</h3>
<p>Please keep the changes as concise as possible. For major architectural changes (>500 LOC excluding kernel/data/config/test), we would expect a GitHub issue (RFC) discussing the technical design and justification. Otherwise, we will tag it with <code>rfc-required</code> and might not go through the PR.</p>

<h3>What to Expect for the Reviews</h3>

<p>The goal of the vLLM team is to be a <i>transparent reviewing machine</i>. We would like to make the review process transparent and efficient and make sure no contributor feels confused or frustrated. However, the vLLM team is small, so we need to prioritize some PRs over others. Here is what you can expect from the review process:</p>

<ul>
    <li> After the PR is submitted, the PR will be assigned to a reviewer. Every reviewer will pick up the PRs based on their expertise and availability.</li>
    <li> After the PR is assigned, the reviewer will provide a status update every 2-3 days. If the PR is not reviewed within 7 days, please feel free to ping the reviewer or the vLLM team.</li>
    <li> After the review, the reviewer will put an <code>action-required</code> label on the PR if there are changes required. The contributor should address the comments and ping the reviewer to re-review the PR.</li>
    <li> Please respond to all comments within a reasonable time frame. If a comment isn't clear or you disagree with a suggestion, feel free to ask for clarification or discuss the suggestion.</li>
</ul>

<h3>Thank You</h3>

<p> Finally, thank you for taking the time to read these guidelines and for your interest in contributing to vLLM. Your contributions make vLLM a great tool for everyone! </p>

</details>
42  .github/workflows/clang-format.yml  vendored  Normal file
@@ -0,0 +1,42 @@
name: clang-format

on:
  # Trigger the workflow on push or pull request,
  # but only for the main branch
  push:
    branches:
      - main
  pull_request:
    branches:
      - main

jobs:
  clang-format:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ["3.11"]
    steps:
    - uses: actions/checkout@v2
    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v2
      with:
        python-version: ${{ matrix.python-version }}
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        pip install clang-format==18.1.5
    - name: Running clang-format
      run: |
        EXCLUDES=(
            'csrc/moe/topk_softmax_kernels.cu'
            'csrc/punica/bgmv/bgmv_bf16_bf16_bf16.cu'
            'csrc/punica/bgmv/bgmv_config.h'
            'csrc/punica/bgmv/bgmv_impl.cuh'
            'csrc/punica/bgmv/vec_dtypes.cuh'
            'csrc/punica/punica_ops.cu'
            'csrc/punica/type_convert.h'
        )
        find csrc/ \( -name '*.h' -o -name '*.cpp' -o -name '*.cu' -o -name '*.cuh' \) -print \
            | grep -vFf <(printf "%s\n" "${EXCLUDES[@]}") \
            | xargs clang-format --dry-run --Werror
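The same check can be reproduced locally before pushing. A minimal sketch, assuming a checkout of the repository root and the pinned clang-format version from the workflow above (the exclude filter is collapsed to a `csrc/punica/` prefix here for brevity, which is slightly broader than the CI EXCLUDES array):

```bash
pip install clang-format==18.1.5

# Format-check all C/C++/CUDA sources under csrc/, skipping the vendored kernels.
find csrc/ \( -name '*.h' -o -name '*.cpp' -o -name '*.cu' -o -name '*.cuh' \) -print \
    | grep -vF -e 'csrc/moe/topk_softmax_kernels.cu' -e 'csrc/punica/' \
    | xargs clang-format --dry-run --Werror
```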
51  .github/workflows/mypy.yaml  vendored  Normal file
@@ -0,0 +1,51 @@
name: mypy

on:
  # Trigger the workflow on push or pull request,
  # but only for the main branch
  push:
    branches:
      - main
  pull_request:
    branches:
      - main

jobs:
  ruff:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ["3.8", "3.9", "3.10", "3.11"]
    steps:
    - uses: actions/checkout@v2
    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v2
      with:
        python-version: ${{ matrix.python-version }}
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        pip install mypy==1.9.0
        pip install types-setuptools
        pip install types-PyYAML
        pip install types-requests
        pip install types-setuptools
    - name: Mypy
      run: |
        mypy vllm/attention --config-file pyproject.toml
        mypy vllm/core --config-file pyproject.toml
        mypy vllm/distributed --config-file pyproject.toml
        mypy vllm/entrypoints --config-file pyproject.toml
        mypy vllm/executor --config-file pyproject.toml
        mypy vllm/multimodal --config-file pyproject.toml
        mypy vllm/usage --config-file pyproject.toml
        mypy vllm/*.py --config-file pyproject.toml
        mypy vllm/transformers_utils --config-file pyproject.toml
        mypy vllm/engine --config-file pyproject.toml
        mypy vllm/worker --config-file pyproject.toml
        mypy vllm/spec_decode --config-file pyproject.toml
        mypy vllm/model_executor --config-file pyproject.toml
        mypy vllm/lora --config-file pyproject.toml
        mypy vllm/logging --config-file pyproject.toml
        mypy vllm/model_executor --config-file pyproject.toml
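To run the same type checks locally, the per-directory invocations above can be driven from a small loop. A sketch, assuming the repository root as the working directory and the pinned tool versions from the workflow:

```bash
pip install mypy==1.9.0 types-setuptools types-PyYAML types-requests

# Check the same vllm subpackages the CI job checks, one at a time.
for pkg in attention core distributed entrypoints executor multimodal usage \
           transformers_utils engine worker spec_decode model_executor lora logging; do
    mypy "vllm/${pkg}" --config-file pyproject.toml
done
mypy vllm/*.py --config-file pyproject.toml
```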
10  .github/workflows/publish.yml  vendored
@@ -49,13 +49,19 @@ jobs:
       matrix:
           os: ['ubuntu-20.04']
           python-version: ['3.8', '3.9', '3.10', '3.11']
-          pytorch-version: ['2.1.0']
+          pytorch-version: ['2.3.0']  # Must be the most recent version that meets requirements-cuda.txt.
           cuda-version: ['11.8', '12.1']
 
     steps:
       - name: Checkout
         uses: actions/checkout@v3
 
+      - name: Setup ccache
+        uses: hendrikmuhs/ccache-action@v1.2
+        with:
+          create-symlink: true
+          key: ${{ github.job }}-${{ matrix.python-version }}-${{ matrix.cuda-version }}
+
      - name: Set up Linux Env
        if: ${{ runner.os == 'Linux' }}
        run: |
@@ -76,6 +82,8 @@ jobs:
 
      - name: Build wheel
        shell: bash
+       env:
+         CMAKE_BUILD_TYPE: Release  # do not compile with debug symbol to reduce wheel size
        run: |
          bash -x .github/workflows/scripts/build.sh ${{ matrix.python-version }} ${{ matrix.cuda-version }}
          wheel_name=$(ls dist/*whl | xargs -n 1 basename)
@@ -1,4 +1,4 @@
-name: pylint
+name: ruff
 
 on:
   # Trigger the workflow on push or pull request,
@@ -11,11 +11,11 @@ on:
       - main
 
 jobs:
-  pylint:
+  ruff:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ["3.10"]
+        python-version: ["3.8", "3.9", "3.10", "3.11"]
     steps:
     - uses: actions/checkout@v2
     - name: Set up Python ${{ matrix.python-version }}
@@ -25,7 +25,13 @@ jobs:
     - name: Install dependencies
       run: |
         python -m pip install --upgrade pip
-        pip install pylint==2.8.2
-    - name: Analysing the code with pylint
+        pip install ruff==0.1.5 codespell==2.2.6 tomli==2.0.1 isort==5.13.2
+    - name: Analysing the code with ruff
       run: |
-        pylint vllm tests
+        ruff .
+    - name: Spelling check with codespell
+      run: |
+        codespell --toml pyproject.toml
+    - name: Run isort
+      run: |
+        isort . --check-only
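The updated lint job can be reproduced locally with the same pinned versions; a minimal local equivalent of the three steps added above:

```bash
pip install ruff==0.1.5 codespell==2.2.6 tomli==2.0.1 isort==5.13.2

ruff .                           # lint
codespell --toml pyproject.toml  # spelling
isort . --check-only             # import ordering
```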
7  .github/workflows/scripts/build.sh  vendored
@@ -9,10 +9,13 @@ LD_LIBRARY_PATH=${cuda_home}/lib64:$LD_LIBRARY_PATH
 
 # Install requirements
 $python_executable -m pip install wheel packaging
-$python_executable -m pip install -r requirements.txt
+$python_executable -m pip install -r requirements-cuda.txt
 
 # Limit the number of parallel jobs to avoid OOM
 export MAX_JOBS=1
+# Make sure punica is built for the release (for LoRA)
+export VLLM_INSTALL_PUNICA_KERNELS=1
+# Make sure release wheels are built for the following architectures
+export TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
 # Build
 $python_executable setup.py bdist_wheel --dist-dir=dist
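Taken together with the publish workflow above, the release build can be approximated locally. A sketch, assuming a working CUDA toolchain and the default `python` in place of the workflow's versioned interpreter:

```bash
export MAX_JOBS=1                     # limit parallel jobs to avoid OOM
export VLLM_INSTALL_PUNICA_KERNELS=1  # build punica kernels for LoRA
export TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
export CMAKE_BUILD_TYPE=Release       # no debug symbols, smaller wheel

python -m pip install wheel packaging
python -m pip install -r requirements-cuda.txt
python setup.py bdist_wheel --dist-dir=dist
```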
2  .github/workflows/scripts/create_release.js  vendored
@@ -8,7 +8,7 @@ module.exports = async (github, context, core) => {
     generate_release_notes: true,
     name: process.env.RELEASE_TAG,
     owner: context.repo.owner,
-    prerelease: false,
+    prerelease: true,
     repo: context.repo.repo,
     tag_name: process.env.RELEASE_TAG,
   });
4  .github/workflows/yapf.yml  vendored
@@ -14,7 +14,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ["3.10"]
+        python-version: ["3.8", "3.9", "3.10", "3.11"]
     steps:
     - uses: actions/checkout@v2
     - name: Set up Python ${{ matrix.python-version }}
@@ -28,4 +28,4 @@ jobs:
         pip install toml==0.10.2
     - name: Running yapf
       run: |
-        yapf --diff --recursive vllm tests
+        yapf --diff --recursive .
10  .gitignore  vendored
@@ -70,6 +70,8 @@ instance/
 
 # Sphinx documentation
 docs/_build/
+docs/source/getting_started/examples/*.rst
+!**/*.template.rst
 
 # PyBuilder
 .pybuilder/
@@ -177,3 +179,11 @@ _build/
 # vim swap files
 *.swo
 *.swp
+
+# hip files generated by PyTorch
+*.hip
+*_hip*
+hip_compat.h
+
+# Benchmark dataset
+*.json
434  .pylintrc
@@ -1,434 +0,0 @@
# This Pylint rcfile contains a best-effort configuration to uphold the
# best-practices and style described in the Google Python style guide:
#   https://google.github.io/styleguide/pyguide.html
#
# Its canonical open-source location is:
#   https://google.github.io/styleguide/pylintrc

[MASTER]

# Files or directories to be skipped. They should be base names, not paths.
ignore=docs

# Files or directories matching the regex patterns are skipped. The regex
# matches against base names, not paths.
ignore-patterns=

# Pickle collected data for later comparisons.
persistent=no

# List of plugins (as comma separated values of python modules names) to load,
# usually to register additional checkers.
load-plugins=

# Use multiple processes to speed up Pylint.
jobs=4

# Allow loading of arbitrary C extensions. Extensions are imported into the
# active Python interpreter and may run arbitrary code.
unsafe-load-any-extension=no


[MESSAGES CONTROL]

# Only show warnings with the listed confidence levels. Leave empty to show
# all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED
confidence=

# Enable the message, report, category or checker with the given id(s). You can
# either give multiple identifier separated by comma (,) or put this option
# multiple time (only on the command line, not in the configuration file where
# it should appear only once). See also the "--disable" option for examples.
#enable=

# Disable the message, report, category or checker with the given id(s). You
# can either give multiple identifiers separated by comma (,) or put this
# option multiple times (only on the command line, not in the configuration
# file where it should appear only once).You can also use "--disable=all" to
# disable everything first and then reenable specific checks. For example, if
# you want to run only the similarities checker, you can use "--disable=all
# --enable=similarities". If you want to run only the classes checker, but have
# no Warning level messages displayed, use"--disable=all --enable=classes
# --disable=W"
disable=abstract-method,
        apply-builtin,
        arguments-differ,
        attribute-defined-outside-init,
        backtick,
        bad-option-value,
        basestring-builtin,
        buffer-builtin,
        c-extension-no-member,
        consider-using-enumerate,
        cmp-builtin,
        cmp-method,
        coerce-builtin,
        coerce-method,
        delslice-method,
        div-method,
        duplicate-code,
        eq-without-hash,
        execfile-builtin,
        file-builtin,
        filter-builtin-not-iterating,
        fixme,
        getslice-method,
        global-statement,
        hex-method,
        idiv-method,
        implicit-str-concat-in-sequence,
        import-error,
        import-self,
        import-star-module-level,
        inconsistent-return-statements,
        input-builtin,
        intern-builtin,
        invalid-str-codec,
        locally-disabled,
        logging-fstring-interpolation, # added by vLLM
        logging-not-lazy, # added by vLLM
        long-builtin,
        long-suffix,
        map-builtin-not-iterating,
        misplaced-comparison-constant,
        missing-class-docstring, # TODO (vLLM): enable
        missing-function-docstring,
        missing-module-docstring, # TODO (vLLM): enable
        metaclass-assignment,
        next-method-called,
        next-method-defined,
        no-absolute-import,
        no-else-break,
        no-else-continue,
        no-else-raise,
        no-else-return,
        no-init, # added
        no-member,
        no-name-in-module,
        no-self-use,
        nonzero-method,
        oct-method,
        old-division,
        old-ne-operator,
        old-octal-literal,
        old-raise-syntax,
        parameter-unpacking,
        print-statement,
        raising-string,
        range-builtin-not-iterating,
        raw_input-builtin,
        rdiv-method,
        reduce-builtin,
        relative-import,
        reload-builtin,
        round-builtin,
        setslice-method,
        signature-differs,
        standarderror-builtin,
        suppressed-message,
        sys-max-int,
        too-few-public-methods,
        too-many-ancestors,
        too-many-arguments,
        too-many-boolean-expressions,
        too-many-branches,
        too-many-instance-attributes,
        too-many-locals,
        too-many-nested-blocks,
        too-many-public-methods,
        too-many-return-statements,
        too-many-statements,
        trailing-newlines,
        unichr-builtin,
        unicode-builtin,
        unnecessary-pass,
        unpacking-in-except,
        unspecified-encoding,
        useless-else-on-loop,
        useless-object-inheritance,
        useless-suppression,
        using-cmp-argument,
        wrong-import-order,
        xrange-builtin,
        zip-builtin-not-iterating,


[REPORTS]

# Set the output format. Available formats are text, parseable, colorized, msvs
# (visual studio) and html. You can also give a reporter class, eg
# mypackage.mymodule.MyReporterClass.
output-format=text

# Tells whether to display a full report or only the messages
reports=no

# Python expression which should return a note less than 10 (10 is the highest
# note). You have access to the variables errors warning, statement which
# respectively contain the number of errors / warnings messages and the total
# number of statements analyzed. This is used by the global evaluation report
# (RP0004).
evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)

# Template used to display messages. This is a python new-style format string
# used to format the message information. See doc for all details
#msg-template=


[BASIC]

# Good variable names which should always be accepted, separated by a comma
good-names=main,_

# Bad variable names which should always be refused, separated by a comma
bad-names=

# Colon-delimited sets of names that determine each other's naming style when
# the name regexes allow several styles.
name-group=

# Include a hint for the correct naming format with invalid-name
include-naming-hint=no

# List of decorators that produce properties, such as abc.abstractproperty. Add
# to this list to register other decorators that produce valid properties.
property-classes=abc.abstractproperty,cached_property.cached_property,cached_property.threaded_cached_property,cached_property.cached_property_with_ttl,cached_property.threaded_cached_property_with_ttl

# Regular expression matching correct function names
function-rgx=^(?:(?P<exempt>setUp|tearDown|setUpModule|tearDownModule)|(?P<camel_case>_?[A-Z][a-zA-Z0-9]*)|(?P<snake_case>_?[a-z][a-z0-9_]*))$

# Regular expression matching correct variable names
variable-rgx=^[a-z][a-z0-9_]*$

# Regular expression matching correct constant names
const-rgx=^(_?[A-Z][A-Z0-9_]*|__[a-z0-9_]+__|_?[a-z][a-z0-9_]*)$

# Regular expression matching correct attribute names
attr-rgx=^_{0,2}[a-z][a-z0-9_]*$

# Regular expression matching correct argument names
argument-rgx=^[a-z][a-z0-9_]*$

# Regular expression matching correct class attribute names
class-attribute-rgx=^(_?[A-Z][A-Z0-9_]*|__[a-z0-9_]+__|_?[a-z][a-z0-9_]*)$

# Regular expression matching correct inline iteration names
inlinevar-rgx=^[a-z][a-z0-9_]*$

# Regular expression matching correct class names
class-rgx=^_?[A-Z][a-zA-Z0-9]*$

# Regular expression matching correct module names
module-rgx=^(_?[a-z][a-z0-9_]*|__init__)$

# Regular expression matching correct method names
method-rgx=(?x)^(?:(?P<exempt>_[a-z0-9_]+__|runTest|setUp|tearDown|setUpTestCase|tearDownTestCase|setupSelf|tearDownClass|setUpClass|(test|assert)_*[A-Z0-9][a-zA-Z0-9_]*|next)|(?P<camel_case>_{0,2}[A-Z][a-zA-Z0-9_]*)|(?P<snake_case>_{0,2}[a-z][a-z0-9_]*))$

# Regular expression which should only match function or class names that do
# not require a docstring.
no-docstring-rgx=(__.*__|main|test.*|.*test|.*Test)$

# Minimum line length for functions/classes that require docstrings, shorter
# ones are exempt.
docstring-min-length=10


[TYPECHECK]

# List of decorators that produce context managers, such as
# contextlib.contextmanager. Add to this list to register other decorators that
# produce valid context managers.
contextmanager-decorators=contextlib.contextmanager,contextlib2.contextmanager

# Tells whether missing members accessed in mixin class should be ignored. A
# mixin class is detected if its name ends with "mixin" (case insensitive).
ignore-mixin-members=yes

# List of module names for which member attributes should not be checked
# (useful for modules/projects where namespaces are manipulated during runtime
# and thus existing member attributes cannot be deduced by static analysis. It
# supports qualified module names, as well as Unix pattern matching.
ignored-modules=

# List of class names for which member attributes should not be checked (useful
# for classes with dynamically set attributes). This supports the use of
# qualified names.
ignored-classes=optparse.Values,thread._local,_thread._local

# List of members which are set dynamically and missed by pylint inference
# system, and so shouldn't trigger E1101 when accessed. Python regular
# expressions are accepted.
generated-members=


[FORMAT]

# Maximum number of characters on a single line.
max-line-length=80

# TODO(https://github.com/PyCQA/pylint/issues/3352): Direct pylint to exempt
# lines made too long by directives to pytype.

# Regexp for a line that is allowed to be longer than the limit.
ignore-long-lines=(?x)(
  ^\s*(\#\ )?<?https?://\S+>?$|
  ^\s*(from\s+\S+\s+)?import\s+.+$)

# Allow the body of an if to be on the same line as the test if there is no
# else.
single-line-if-stmt=yes

# Maximum number of lines in a module
max-module-lines=99999

# String used as indentation unit. The internal Google style guide mandates 2
# spaces. Google's externaly-published style guide says 4, consistent with
# PEP 8. Here, we use 2 spaces, for conformity with many open-sourced Google
# projects (like TensorFlow).
indent-string='  '

# Number of spaces of indent required inside a hanging or continued line.
indent-after-paren=4

# Expected format of line ending, e.g. empty (any line ending), LF or CRLF.
expected-line-ending-format=


[MISCELLANEOUS]

# List of note tags to take in consideration, separated by a comma.
notes=TODO


[STRING]

# This flag controls whether inconsistent-quotes generates a warning when the
# character used as a quote delimiter is used inconsistently within a module.
check-quote-consistency=yes


[VARIABLES]

# Tells whether we should check for unused import in __init__ files.
init-import=no

# A regular expression matching the name of dummy variables (i.e. expectedly
# not used).
dummy-variables-rgx=^\*{0,2}(_$|unused_|dummy_)

# List of additional names supposed to be defined in builtins. Remember that
# you should avoid to define new builtins when possible.
additional-builtins=

# List of strings which can identify a callback function by name. A callback
# name must start or end with one of those strings.
callbacks=cb_,_cb

# List of qualified module names which can have objects that can redefine
# builtins.
redefining-builtins-modules=six,six.moves,past.builtins,future.builtins,functools


[LOGGING]

# Logging modules to check that the string format arguments are in logging
# function parameter format
logging-modules=logging,absl.logging,tensorflow.io.logging


[SIMILARITIES]

# Minimum lines number of a similarity.
min-similarity-lines=4

# Ignore comments when computing similarities.
ignore-comments=yes

# Ignore docstrings when computing similarities.
ignore-docstrings=yes

# Ignore imports when computing similarities.
ignore-imports=no


[SPELLING]

# Spelling dictionary name. Available dictionaries: none. To make it working
# install python-enchant package.
spelling-dict=

# List of comma separated words that should not be checked.
spelling-ignore-words=

# A path to a file that contains private dictionary; one word per line.
spelling-private-dict-file=

# Tells whether to store unknown words to indicated private dictionary in
# --spelling-private-dict-file option instead of raising a message.
spelling-store-unknown-words=no


[IMPORTS]

# Deprecated modules which should not be used, separated by a comma
deprecated-modules=regsub,
                   TERMIOS,
                   Bastion,
                   rexec,
                   sets

# Create a graph of every (i.e. internal and external) dependencies in the
# given file (report RP0402 must not be disabled)
import-graph=

# Create a graph of external dependencies in the given file (report RP0402 must
# not be disabled)
ext-import-graph=

# Create a graph of internal dependencies in the given file (report RP0402 must
# not be disabled)
int-import-graph=

# Force import order to recognize a module as part of the standard
# compatibility libraries.
known-standard-library=

# Force import order to recognize a module as part of a third party library.
known-third-party=enchant, absl

# Analyse import fallback blocks. This can be used to support both Python 2 and
# 3 compatible code, which means that the block might have code that exists
# only in one or another interpreter, leading to false positives when analysed.
analyse-fallback-blocks=no


[CLASSES]

# List of method names used to declare (i.e. assign) instance attributes.
defining-attr-methods=__init__,
                      __new__,
                      setUp

# List of member names, which should be excluded from the protected access
# warning.
exclude-protected=_asdict,
                  _fields,
                  _replace,
                  _source,
                  _make

# List of valid names for the first argument in a class method.
valid-classmethod-first-arg=cls,
                            class_

# List of valid names for the first argument in a metaclass class method.
valid-metaclass-classmethod-first-arg=mcs


[EXCEPTIONS]

# Exceptions that will emit a warning when being caught. Defaults to
# "Exception"
overgeneral-exceptions=StandardError,
                       Exception,
                       BaseException
1  .yapfignore  Normal file
@@ -0,0 +1 @@
collect_env.py
315
CMakeLists.txt
Normal file
315
CMakeLists.txt
Normal file
@ -0,0 +1,315 @@
|
|||||||
|
cmake_minimum_required(VERSION 3.21)
|
||||||
|
|
||||||
|
project(vllm_extensions LANGUAGES CXX)
|
||||||
|
|
||||||
|
option(VLLM_TARGET_DEVICE "Target device backend for vLLM" "cuda")
|
||||||
|
|
||||||
|
message(STATUS "Build type: ${CMAKE_BUILD_TYPE}")
|
||||||
|
message(STATUS "Target device: ${VLLM_TARGET_DEVICE}")
|
||||||
|
|
||||||
|
include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake)
|
||||||
|
|
||||||
|
#
|
||||||
|
# Supported python versions. These versions will be searched in order, the
|
||||||
|
# first match will be selected. These should be kept in sync with setup.py.
|
||||||
|
#
|
||||||
|
set(PYTHON_SUPPORTED_VERSIONS "3.8" "3.9" "3.10" "3.11")
|
||||||
|
|
||||||
|
# Supported NVIDIA architectures.
|
||||||
|
set(CUDA_SUPPORTED_ARCHS "7.0;7.5;8.0;8.6;8.9;9.0")
|
||||||
|
|
||||||
|
# Supported AMD GPU architectures.
|
||||||
|
set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100")
|
||||||
|
|
||||||
|
#
|
||||||
|
# Supported/expected torch versions for CUDA/ROCm.
|
||||||
|
#
|
||||||
|
# Currently, having an incorrect pytorch version results in a warning
|
||||||
|
# rather than an error.
|
||||||
|
#
|
||||||
|
# Note: the CUDA torch version is derived from pyproject.toml and various
|
||||||
|
# requirements.txt files and should be kept consistent. The ROCm torch
|
||||||
|
# versions are derived from Dockerfile.rocm
|
||||||
|
#
|
||||||
|
set(TORCH_SUPPORTED_VERSION_CUDA "2.3.0")
|
||||||
|
set(TORCH_SUPPORTED_VERSION_ROCM_5X "2.0.1")
|
||||||
|
set(TORCH_SUPPORTED_VERSION_ROCM_6X "2.1.1")
|
||||||
|
|
||||||
|
#
|
||||||
|
# Try to find python package with an executable that exactly matches
|
||||||
|
# `VLLM_PYTHON_EXECUTABLE` and is one of the supported versions.
|
||||||
|
#
|
||||||
|
if (VLLM_PYTHON_EXECUTABLE)
|
||||||
|
find_python_from_executable(${VLLM_PYTHON_EXECUTABLE} "${PYTHON_SUPPORTED_VERSIONS}")
|
||||||
|
else()
|
||||||
|
message(FATAL_ERROR
|
||||||
|
"Please set VLLM_PYTHON_EXECUTABLE to the path of the desired python version"
|
||||||
|
" before running cmake configure.")
|
||||||
|
endif()
|
||||||
|
|
||||||
|
#
|
||||||
|
# Update cmake's `CMAKE_PREFIX_PATH` with torch location.
|
||||||
|
#
|
||||||
|
append_cmake_prefix_path("torch" "torch.utils.cmake_prefix_path")
|
||||||
|
|
||||||
|
# Ensure the 'nvcc' command is in the PATH
|
||||||
|
find_program(NVCC_EXECUTABLE nvcc)
|
||||||
|
if (CUDA_FOUND AND NOT NVCC_EXECUTABLE)
|
||||||
|
message(FATAL_ERROR "nvcc not found")
|
||||||
|
endif()
|
||||||
|
|
||||||
|
#
|
||||||
|
# Import torch cmake configuration.
|
||||||
|
# Torch also imports CUDA (and partially HIP) languages with some customizations,
|
||||||
|
# so there is no need to do this explicitly with check_language/enable_language,
|
||||||
|
# etc.
|
||||||
|
#
|
||||||
|
find_package(Torch REQUIRED)
|
||||||
|
|
||||||
|
#
|
||||||
|
# Forward the non-CUDA device extensions to external CMake scripts.
|
||||||
|
#
|
||||||
|
if (NOT VLLM_TARGET_DEVICE STREQUAL "cuda" AND
|
||||||
|
NOT VLLM_TARGET_DEVICE STREQUAL "rocm")
|
||||||
|
if (VLLM_TARGET_DEVICE STREQUAL "cpu")
|
||||||
|
include(${CMAKE_CURRENT_LIST_DIR}/cmake/cpu_extension.cmake)
|
||||||
|
else()
|
||||||
|
message(FATAL_ERROR "Unsupported vLLM target device: ${VLLM_TARGET_DEVICE}")
|
||||||
|
endif()
|
||||||
|
return()
|
||||||
|
endif()
|
||||||
|
|
||||||
|
#
|
||||||
|
# Set up GPU language and check the torch version and warn if it isn't
|
||||||
|
# what is expected.
|
||||||
|
#
|
||||||
|
if (NOT HIP_FOUND AND CUDA_FOUND)
|
||||||
|
set(VLLM_GPU_LANG "CUDA")
|
||||||
|
|
||||||
|
if (NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_CUDA})
|
||||||
|
message(WARNING "Pytorch version ${TORCH_SUPPORTED_VERSION_CUDA} "
|
||||||
|
"expected for CUDA build, saw ${Torch_VERSION} instead.")
|
||||||
|
endif()
|
||||||
|
elseif(HIP_FOUND)
|
||||||
|
set(VLLM_GPU_LANG "HIP")
|
||||||
|
|
||||||
|
# Importing torch recognizes and sets up some HIP/ROCm configuration but does
|
||||||
|
# not let cmake recognize .hip files. In order to get cmake to understand the
|
||||||
|
# .hip extension automatically, HIP must be enabled explicitly.
|
||||||
|
enable_language(HIP)
|
||||||
|
|
||||||
|
# ROCm 5.x
|
||||||
|
if (ROCM_VERSION_DEV_MAJOR EQUAL 5 AND
|
||||||
|
NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM_5X})
|
||||||
|
message(WARNING "Pytorch version ${TORCH_SUPPORTED_VERSION_ROCM_5X} "
|
||||||
|
"expected for ROCMm 5.x build, saw ${Torch_VERSION} instead.")
|
||||||
|
endif()
|
||||||
|
|
||||||
|
# ROCm 6.x
|
||||||
|
if (ROCM_VERSION_DEV_MAJOR EQUAL 6 AND
|
||||||
|
NOT Torch_VERSION VERSION_EQUAL ${TORCH_SUPPORTED_VERSION_ROCM_6X})
|
||||||
|
message(WARNING "Pytorch version ${TORCH_SUPPORTED_VERSION_ROCM_6X} "
|
||||||
|
"expected for ROCMm 6.x build, saw ${Torch_VERSION} instead.")
|
||||||
|
endif()
|
||||||
|
else()
|
||||||
|
message(FATAL_ERROR "Can't find CUDA or HIP installation.")
|
||||||
|
endif()
|
||||||
|
|
||||||
|
#
|
||||||
|
# Override the GPU architectures detected by cmake/torch and filter them by
|
||||||
|
# the supported versions for the current language.
|
||||||
|
# The final set of arches is stored in `VLLM_GPU_ARCHES`.
|
||||||
|
#
|
||||||
|
override_gpu_arches(VLLM_GPU_ARCHES
|
||||||
|
${VLLM_GPU_LANG}
|
||||||
|
"${${VLLM_GPU_LANG}_SUPPORTED_ARCHS}")
|
||||||
|
|
||||||
|
#
|
||||||
|
# Query torch for additional GPU compilation flags for the given
|
||||||
|
# `VLLM_GPU_LANG`.
|
||||||
|
# The final set of arches is stored in `VLLM_GPU_FLAGS`.
|
||||||
|
#
|
||||||
|
get_torch_gpu_compiler_flags(VLLM_GPU_FLAGS ${VLLM_GPU_LANG})
|
||||||
|
|
||||||
|
#
|
||||||
|
# Set nvcc parallelism.
|
||||||
|
#
|
||||||
|
if(NVCC_THREADS AND VLLM_GPU_LANG STREQUAL "CUDA")
|
||||||
|
list(APPEND VLLM_GPU_FLAGS "--threads=${NVCC_THREADS}")
|
||||||
|
endif()
|
||||||
|
|
||||||
|
#
|
||||||
|
# Define extension targets
|
||||||
|
#
|
||||||
|
|
||||||
|
#
|
||||||
|
# _C extension
|
||||||
|
#
|
||||||
|
|
||||||
|
set(VLLM_EXT_SRC
|
||||||
|
"csrc/cache_kernels.cu"
|
||||||
|
"csrc/attention/attention_kernels.cu"
|
||||||
|
"csrc/pos_encoding_kernels.cu"
|
||||||
|
"csrc/activation_kernels.cu"
|
||||||
|
"csrc/layernorm_kernels.cu"
|
||||||
|
"csrc/quantization/squeezellm/quant_cuda_kernel.cu"
|
||||||
|
"csrc/quantization/gptq/q_gemm.cu"
|
||||||
|
"csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
|
||||||
|
"csrc/quantization/fp8/common.cu"
|
||||||
|
"csrc/cuda_utils_kernels.cu"
|
||||||
|
"csrc/moe_align_block_size_kernels.cu"
|
||||||
|
"csrc/torch_bindings.cpp")
|
||||||
|
|
||||||
|
if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||||
|
include(FetchContent)
|
||||||
|
SET(CUTLASS_ENABLE_HEADERS_ONLY=ON)
|
||||||
|
FetchContent_Declare(
|
||||||
|
cutlass
|
||||||
|
GIT_REPOSITORY https://github.com/nvidia/cutlass.git
|
||||||
|
# CUTLASS 3.5.0
|
||||||
|
GIT_TAG 7d49e6c7e2f8896c47f586706e67e1fb215529dc
|
||||||
|
)
|
||||||
|
FetchContent_MakeAvailable(cutlass)
|
||||||
|
|
||||||
|
list(APPEND VLLM_EXT_SRC
|
||||||
|
"csrc/quantization/aqlm/gemm_kernels.cu"
|
||||||
|
"csrc/quantization/awq/gemm_kernels.cu"
|
||||||
|
"csrc/quantization/marlin/dense/marlin_cuda_kernel.cu"
|
||||||
|
"csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
|
||||||
|
"csrc/quantization/gptq_marlin/gptq_marlin.cu"
|
||||||
|
"csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
|
||||||
|
"csrc/custom_all_reduce.cu"
|
||||||
|
"csrc/quantization/cutlass_w8a8/scaled_mm_dq_entry.cu"
|
||||||
|
"csrc/quantization/cutlass_w8a8/scaled_mm_dq_c2x.cu"
|
||||||
|
"csrc/quantization/cutlass_w8a8/scaled_mm_dq_c3x.cu")
|
||||||
|
|
||||||
|
#
|
||||||
|
# The CUTLASS kernels for Hopper require sm90a to be enabled.
|
||||||
|
```cmake
# This is done via the below gencode option, BUT that creates kernels for both sm90 and sm90a.
# That adds an extra 17MB to compiled binary, so instead we selectively enable it.
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0)
  set_source_files_properties(
    "csrc/quantization/cutlass_w8a8/scaled_mm_dq_c3x.cu"
    PROPERTIES
    COMPILE_FLAGS
    "-gencode arch=compute_90a,code=sm_90a")
  endif()

endif()

define_gpu_extension_target(
  _C
  DESTINATION vllm
  LANGUAGE ${VLLM_GPU_LANG}
  SOURCES ${VLLM_EXT_SRC}
  COMPILE_FLAGS ${VLLM_GPU_FLAGS}
  ARCHITECTURES ${VLLM_GPU_ARCHES}
  INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR};${CUTLASS_TOOLS_UTIL_INCLUDE_DIR}
  USE_SABI 3
  WITH_SOABI)

#
# _moe_C extension
#

set(VLLM_MOE_EXT_SRC
  "csrc/moe/torch_bindings.cpp"
  "csrc/moe/topk_softmax_kernels.cu")

define_gpu_extension_target(
  _moe_C
  DESTINATION vllm
  LANGUAGE ${VLLM_GPU_LANG}
  SOURCES ${VLLM_MOE_EXT_SRC}
  COMPILE_FLAGS ${VLLM_GPU_FLAGS}
  ARCHITECTURES ${VLLM_GPU_ARCHES}
  USE_SABI 3
  WITH_SOABI)

#
# _punica_C extension
#

set(VLLM_PUNICA_EXT_SRC
  "csrc/punica/bgmv/bgmv_bf16_bf16_bf16.cu"
  "csrc/punica/bgmv/bgmv_bf16_fp32_bf16.cu"
  "csrc/punica/bgmv/bgmv_fp16_fp16_fp16.cu"
  "csrc/punica/bgmv/bgmv_fp16_fp32_fp16.cu"
  "csrc/punica/bgmv/bgmv_fp32_bf16_bf16.cu"
  "csrc/punica/bgmv/bgmv_fp32_fp16_fp16.cu"
  "csrc/punica/punica_ops.cu"
  "csrc/punica/torch_bindings.cpp")

#
# Copy GPU compilation flags+update for punica
#
set(VLLM_PUNICA_GPU_FLAGS ${VLLM_GPU_FLAGS})
list(REMOVE_ITEM VLLM_PUNICA_GPU_FLAGS
  "-D__CUDA_NO_HALF_OPERATORS__"
  "-D__CUDA_NO_HALF_CONVERSIONS__"
  "-D__CUDA_NO_BFLOAT16_CONVERSIONS__"
  "-D__CUDA_NO_HALF2_OPERATORS__")

#
# Filter out CUDA architectures < 8.0 for punica.
#
if (${VLLM_GPU_LANG} STREQUAL "CUDA")
  set(VLLM_PUNICA_GPU_ARCHES)
  foreach(ARCH ${VLLM_GPU_ARCHES})
    string_to_ver(CODE_VER ${ARCH})
    if (CODE_VER GREATER_EQUAL 8.0)
      list(APPEND VLLM_PUNICA_GPU_ARCHES ${ARCH})
    endif()
  endforeach()
  message(STATUS "Punica target arches: ${VLLM_PUNICA_GPU_ARCHES}")
elseif(${VLLM_GPU_LANG} STREQUAL "HIP")
  set(VLLM_PUNICA_GPU_ARCHES ${VLLM_GPU_ARCHES})
  message(STATUS "Punica target arches: ${VLLM_PUNICA_GPU_ARCHES}")
endif()

if (VLLM_PUNICA_GPU_ARCHES)
  define_gpu_extension_target(
    _punica_C
    DESTINATION vllm
    LANGUAGE ${VLLM_GPU_LANG}
    SOURCES ${VLLM_PUNICA_EXT_SRC}
    COMPILE_FLAGS ${VLLM_PUNICA_GPU_FLAGS}
    ARCHITECTURES ${VLLM_PUNICA_GPU_ARCHES}
    USE_SABI 3
    WITH_SOABI)
else()
  message(WARNING "Unable to create _punica_C target because none of the "
    "requested architectures (${VLLM_GPU_ARCHES}) are supported, i.e. >= 8.0")
endif()

#
# Add the `default` target which detects which extensions should be
# built based on platform/architecture. This is the same logic that
# setup.py uses to select which extensions should be built and should
# be kept in sync.
#
# The `default` target makes direct use of cmake easier since knowledge
# of which extensions are supported has been factored in, e.g.
#
# mkdir build && cd build
# cmake -G Ninja -DVLLM_PYTHON_EXECUTABLE=`which python3` -DCMAKE_LIBRARY_OUTPUT_DIRECTORY=../vllm ..
# cmake --build . --target default
#
add_custom_target(default)

if(VLLM_GPU_LANG STREQUAL "CUDA" OR VLLM_GPU_LANG STREQUAL "HIP")
  message(STATUS "Enabling C extension.")
  add_dependencies(default _C)

  message(STATUS "Enabling moe extension.")
  add_dependencies(default _moe_C)

  # Enable punica if -DVLLM_INSTALL_PUNICA_KERNELS=ON or
  # VLLM_INSTALL_PUNICA_KERNELS is set in the environment and
  # there are supported target arches.
  if (VLLM_PUNICA_GPU_ARCHES AND
      (ENV{VLLM_INSTALL_PUNICA_KERNELS} OR VLLM_INSTALL_PUNICA_KERNELS))
    message(STATUS "Enabling punica extension.")
    add_dependencies(default _punica_C)
  endif()
endif()
```
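The comments above already spell out the direct-cmake flow for the `default` target; the sketch below only restates it with the punica option turned on explicitly. The `ON` value and the `build/` directory name are assumptions, not taken from the file.

```bash
# Minimal sketch of the direct-cmake flow described in the comments above.
mkdir build && cd build
cmake -G Ninja \
      -DVLLM_PYTHON_EXECUTABLE=$(which python3) \
      -DCMAKE_LIBRARY_OUTPUT_DIRECTORY=../vllm \
      -DVLLM_INSTALL_PUNICA_KERNELS=ON \
      ..
cmake --build . --target default   # builds _C, _moe_C and, if arches allow, _punica_C
```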
````diff
@@ -21,7 +21,6 @@ Express your support on Twitter if vLLM aids you, or simply offer your appreciation by starring our repository.

 ### Build from source

 ```bash
-pip install -r requirements.txt
 pip install -e .  # This may take several minutes.
 ```

@@ -30,6 +29,8 @@ pip install -e .  # This may take several minutes.

 ```bash
 pip install -r requirements-dev.txt

+# linting and formatting
+bash format.sh
 # Static type checking
 mypy
 # Unit tests

@@ -45,31 +46,9 @@ pytest tests/

 If you encounter a bug or have a feature request, please check our issues page first to see if someone else has already reported it.
 If not, please file a new issue, providing as much relevant information as possible.

-### Coding Style Guide
+### Pull Requests & Code Reviews

-In general, we adhere to [Google Python style guide](https://google.github.io/styleguide/pyguide.html) and [Google C++ style guide](https://google.github.io/styleguide/cppguide.html).
+Please check the PR checklist in the [PR template](.github/PULL_REQUEST_TEMPLATE.md) for detailed guide for contribution.

-We include a formatting script [`format.sh`](./format.sh) to format the code.
-
-### Pull Requests
-
-When submitting a pull request:
-
-1. Make sure your code has been rebased on top of the latest commit on the main branch.
-2. Ensure code is properly formatted by running [`format.sh`](./format.sh).
-3. Include a detailed description of the changes in the pull request.
-   Explain why you made the changes you did.
-   If your pull request fixes an open issue, please include a reference to it in the description.
-
-### Code Reviews
-
-All submissions, including submissions by project members, require a code review.
-To make the review process as smooth as possible, please:
-
-1. Keep your changes as concise as possible.
-   If your pull request involves multiple unrelated changes, consider splitting it into separate pull requests.
-2. Respond to all comments within a reasonable time frame.
-   If a comment isn't clear or you disagree with a suggestion, feel free to ask for clarification or discuss the suggestion.
-
 ### Thank You
````
Dockerfile (132 changed lines)

```diff
@@ -1,72 +1,136 @@
-FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 AS dev
+# The vLLM Dockerfile is used to construct vLLM image that can be directly used
+# to run the OpenAI compatible server.
+
+# Please update any changes made here to
+# docs/source/dev/dockerfile/dockerfile.rst and
+# docs/source/assets/dev/dockerfile-stages-dependency.png
+
+#################### BASE BUILD IMAGE ####################
+# prepare basic build environment
+FROM nvidia/cuda:12.4.1-devel-ubuntu22.04 AS dev
 
 RUN apt-get update -y \
-    && apt-get install -y python3-pip
+    && apt-get install -y python3-pip git
+
+# Workaround for https://github.com/openai/triton/issues/2507 and
+# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
+# this won't be needed for future versions of this docker image
+# or future versions of triton.
+RUN ldconfig /usr/local/cuda-12.4/compat/
 
 WORKDIR /workspace
 
 # install build and runtime dependencies
-COPY requirements.txt requirements.txt
+COPY requirements-common.txt requirements-common.txt
+COPY requirements-cuda.txt requirements-cuda.txt
 RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install -r requirements.txt
+    pip install -r requirements-cuda.txt
 
 # install development dependencies
 COPY requirements-dev.txt requirements-dev.txt
 RUN --mount=type=cache,target=/root/.cache/pip \
     pip install -r requirements-dev.txt
 
-# image to build pytorch extensions
+# cuda arch list used by torch
+# can be useful for both `dev` and `test`
+# explicitly set the list to avoid issues with torch 2.2
+# see https://github.com/pytorch/pytorch/pull/123243
+ARG torch_cuda_arch_list='7.0 7.5 8.0 8.6 8.9 9.0+PTX'
+ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
+#################### BASE BUILD IMAGE ####################
+
+
+#################### WHEEL BUILD IMAGE ####################
 FROM dev AS build
 
-# copy input files
+# install build dependencies
+COPY requirements-build.txt requirements-build.txt
+RUN --mount=type=cache,target=/root/.cache/pip \
+    pip install -r requirements-build.txt
+
+# install compiler cache to speed up compilation leveraging local or remote caching
+RUN apt-get update -y && apt-get install -y ccache
+
+# files and directories related to build wheels
 COPY csrc csrc
 COPY setup.py setup.py
-COPY requirements.txt requirements.txt
+COPY cmake cmake
+COPY CMakeLists.txt CMakeLists.txt
+COPY requirements-common.txt requirements-common.txt
+COPY requirements-cuda.txt requirements-cuda.txt
 COPY pyproject.toml pyproject.toml
-COPY vllm/__init__.py vllm/__init__.py
+COPY vllm vllm
 
 # max jobs used by Ninja to build extensions
-ENV MAX_JOBS=$max_jobs
-RUN python3 setup.py build_ext --inplace
+ARG max_jobs=2
+ENV MAX_JOBS=${max_jobs}
+# number of threads used by nvcc
+ARG nvcc_threads=8
+ENV NVCC_THREADS=$nvcc_threads
+# make sure punica kernels are built (for LoRA)
+ENV VLLM_INSTALL_PUNICA_KERNELS=1
 
-# image to run unit testing suite
-FROM dev AS test
+ENV CCACHE_DIR=/root/.cache/ccache
+RUN --mount=type=cache,target=/root/.cache/ccache \
+    --mount=type=cache,target=/root/.cache/pip \
+    python3 setup.py bdist_wheel --dist-dir=dist
 
-# copy pytorch extensions separately to avoid having to rebuild
-# when python code changes
-COPY --from=build /workspace/vllm/*.so /workspace/vllm/
-COPY tests tests
-COPY vllm vllm
+# check the size of the wheel, we cannot upload wheels larger than 100MB
+COPY .buildkite/check-wheel-size.py check-wheel-size.py
+RUN python3 check-wheel-size.py dist
 
-ENTRYPOINT ["python3", "-m", "pytest", "tests"]
+#################### EXTENSION Build IMAGE ####################
 
-# use CUDA base as CUDA runtime dependencies are already installed via pip
-FROM nvidia/cuda:12.1.0-base-ubuntu22.04 AS vllm-base
+#################### vLLM installation IMAGE ####################
+# image with vLLM installed
+FROM nvidia/cuda:12.4.1-base-ubuntu22.04 AS vllm-base
+WORKDIR /vllm-workspace
 
-# libnccl required for ray
 RUN apt-get update -y \
-    && apt-get install -y python3-pip
+    && apt-get install -y python3-pip git vim
 
-WORKDIR /workspace
-COPY requirements.txt requirements.txt
+# Workaround for https://github.com/openai/triton/issues/2507 and
+# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
+# this won't be needed for future versions of this docker image
+# or future versions of triton.
+RUN ldconfig /usr/local/cuda-12.4/compat/
+
+# install vllm wheel first, so that torch etc will be installed
+RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
+    --mount=type=cache,target=/root/.cache/pip \
+    pip install dist/*.whl --verbose
+#################### vLLM installation IMAGE ####################
+
+
+#################### TEST IMAGE ####################
+# image to run unit testing suite
+# note that this uses vllm installed by `pip`
+FROM vllm-base AS test
+
+ADD . /vllm-workspace/
+
+# install development dependencies (for testing)
 RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install -r requirements.txt
+    pip install -r requirements-dev.txt
 
-FROM vllm-base AS vllm
-COPY --from=build /workspace/vllm/*.so /workspace/vllm/
-COPY vllm vllm
+# doc requires source code
+# we hide them inside `test_docs/` , so that this source code
+# will not be imported by other tests
+RUN mkdir test_docs
+RUN mv docs test_docs/
+RUN mv vllm test_docs/
 
-EXPOSE 8000
-ENTRYPOINT ["python3", "-m", "vllm.entrypoints.api_server"]
+#################### TEST IMAGE ####################
 
+#################### OPENAI API SERVER ####################
 # openai api server alternative
 FROM vllm-base AS vllm-openai
 
 # install additional dependencies for openai api server
 RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install accelerate fschat
+    pip install accelerate hf_transfer modelscope
 
-COPY --from=build /workspace/vllm/*.so /workspace/vllm/
-COPY vllm vllm
+ENV VLLM_USAGE_SOURCE production-docker-image
 
 ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
+#################### OPENAI API SERVER ####################
```
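A hedged sketch of how the multi-stage Dockerfile above might be built and run; the image tag, model name, GPU/runtime flags, and host port are placeholders rather than anything the Dockerfile itself prescribes.

```bash
# Build only the OpenAI-server stage and run it (tag and model are placeholders).
DOCKER_BUILDKIT=1 docker build --target vllm-openai -t vllm-openai-local .
docker run --gpus all -p 8000:8000 vllm-openai-local --model facebook/opt-125m
```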
Dockerfile.cpu (new file, 26 lines)

```dockerfile
# This vLLM Dockerfile is used to construct image that can build and run vLLM on x86 CPU platform.

FROM ubuntu:22.04 AS cpu-test-1

RUN apt-get update -y \
    && apt-get install -y git wget vim numactl gcc-12 g++-12 python3 python3-pip \
    && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12

RUN pip install --upgrade pip \
    && pip install wheel packaging ninja "setuptools>=49.4.0" numpy

FROM cpu-test-1 AS build

COPY ./ /workspace/vllm

WORKDIR /workspace/vllm

RUN pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu

RUN VLLM_TARGET_DEVICE=cpu python3 setup.py install

WORKDIR /workspace/

RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks

CMD ["/bin/bash"]
```
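A minimal sketch of building and entering the x86 CPU image defined above; the image tag is a placeholder.

```bash
docker build -f Dockerfile.cpu -t vllm-cpu-local .
docker run -it --rm vllm-cpu-local
```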
Dockerfile.neuron (new file, 36 lines)

```dockerfile
# default base image
ARG BASE_IMAGE="763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-inference-neuronx:2.1.1-neuronx-py310-sdk2.17.0-ubuntu20.04"

FROM $BASE_IMAGE

RUN echo "Base image is $BASE_IMAGE"

# Install some basic utilities
RUN apt-get update && apt-get install python3 python3-pip -y

### Mount Point ###
# When launching the container, mount the code directory to /app
ARG APP_MOUNT=/app
VOLUME [ ${APP_MOUNT} ]
WORKDIR ${APP_MOUNT}

RUN python3 -m pip install --upgrade pip
RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas
RUN python3 -m pip install sentencepiece transformers==4.36.2 -U
RUN python3 -m pip install transformers-neuronx --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
RUN python3 -m pip install --pre neuronx-cc==2.12.* --extra-index-url=https://pip.repos.neuron.amazonaws.com -U

COPY ./vllm /app/vllm/vllm
COPY ./setup.py /app/vllm/setup.py
COPY ./requirements-common.txt /app/vllm/requirements-common.txt
COPY ./requirements-neuron.txt /app/vllm/requirements-neuron.txt

RUN cd /app/vllm \
    && python3 -m pip install -U -r requirements-neuron.txt

ENV VLLM_TARGET_DEVICE neuron
RUN cd /app/vllm \
    && pip install -e . \
    && cd ..

CMD ["/bin/bash"]
```
Dockerfile.rocm (new file, 115 lines)

```dockerfile
# default base image
ARG BASE_IMAGE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1"

FROM $BASE_IMAGE

ARG BASE_IMAGE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1"

RUN echo "Base image is $BASE_IMAGE"

# BASE_IMAGE for ROCm_5.7: "rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1"
# BASE_IMAGE for ROCm_6.0: "rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1"

ARG FA_GFX_ARCHS="gfx90a;gfx942"
RUN echo "FA_GFX_ARCHS is $FA_GFX_ARCHS"

ARG FA_BRANCH="ae7928c"
RUN echo "FA_BRANCH is $FA_BRANCH"

# whether to build flash-attention
# if 0, will not build flash attention
# this is useful for gfx target where flash-attention is not supported
# In that case, we need to use the python reference attention implementation in vllm
ARG BUILD_FA="1"

# whether to build triton on rocm
ARG BUILD_TRITON="1"

# Install some basic utilities
RUN apt-get update && apt-get install python3 python3-pip -y

# Install some basic utilities
RUN apt-get update && apt-get install -y \
    curl \
    ca-certificates \
    sudo \
    git \
    bzip2 \
    libx11-6 \
    build-essential \
    wget \
    unzip \
    nvidia-cuda-toolkit \
    tmux \
 && rm -rf /var/lib/apt/lists/*

### Mount Point ###
# When launching the container, mount the code directory to /app
ARG APP_MOUNT=/vllm-workspace
VOLUME [ ${APP_MOUNT} ]
WORKDIR ${APP_MOUNT}

RUN python3 -m pip install --upgrade pip
RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas

ENV LLVM_SYMBOLIZER_PATH=/opt/rocm/llvm/bin/llvm-symbolizer
ENV PATH=$PATH:/opt/rocm/bin:/libtorch/bin:
ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/lib/:/libtorch/lib:
ENV CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/libtorch/include:/libtorch/include/torch/csrc/api/include/:/opt/rocm/include/:

# Install ROCm flash-attention
RUN if [ "$BUILD_FA" = "1" ]; then \
    mkdir libs \
    && cd libs \
    && git clone https://github.com/ROCm/flash-attention.git \
    && cd flash-attention \
    && git checkout ${FA_BRANCH} \
    && git submodule update --init \
    && export GPU_ARCHS=${FA_GFX_ARCHS} \
    && if [ "$BASE_IMAGE" = "rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1" ]; then \
        patch /opt/conda/envs/py_3.10/lib/python3.10/site-packages/torch/utils/hipify/hipify_python.py hipify_patch.patch; fi \
    && python3 setup.py install \
    && cd ..; \
    fi

# Error related to odd state for numpy 1.20.3 where there is no METADATA etc, but an extra LICENSES_bundled.txt.
# Manually removed it so that later steps of numpy upgrade can continue
RUN if [ "$BASE_IMAGE" = "rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" ]; then \
    rm -rf /opt/conda/envs/py_3.9/lib/python3.9/site-packages/numpy-1.20.3.dist-info/; fi

# build triton
RUN if [ "$BUILD_TRITON" = "1" ]; then \
    mkdir -p libs \
    && cd libs \
    && pip uninstall -y triton \
    && git clone https://github.com/ROCm/triton.git \
    && cd triton/python \
    && pip3 install . \
    && cd ../..; \
    fi

WORKDIR /vllm-workspace
COPY . .

#RUN python3 -m pip install pynvml # to be removed eventually
RUN python3 -m pip install --upgrade pip numba

# make sure punica kernels are built (for LoRA)
ENV VLLM_INSTALL_PUNICA_KERNELS=1
# Workaround for ray >= 2.10.0
ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1

ENV VLLM_NCCL_SO_PATH=/opt/rocm/lib/librccl.so

RUN --mount=type=cache,target=/root/.cache/pip \
    pip install -U -r requirements-rocm.txt \
    && patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h ./rocm_patch/rocm_bf16.patch \
    && python3 setup.py install \
    && cp build/lib.linux-x86_64-cpython-39/vllm/_C.abi3.so vllm/ \
    && cp build/lib.linux-x86_64-cpython-39/vllm/_punica_C.abi3.so vllm/ \
    && cp build/lib.linux-x86_64-cpython-39/vllm/_moe_C.abi3.so vllm/ \
    && cd ..

CMD ["/bin/bash"]
```
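A hedged sketch of a ROCm build of the file above; the image tag is a placeholder and the build args simply echo the `ARG` defaults declared in the Dockerfile, so they can be omitted or overridden.

```bash
docker build -f Dockerfile.rocm \
    --build-arg BASE_IMAGE="rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" \
    --build-arg BUILD_FA="1" \
    --build-arg BUILD_TRITON="1" \
    -t vllm-rocm-local .
```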
```diff
@@ -1,4 +1,10 @@
 include LICENSE
-include requirements.txt
+include requirements-common.txt
+include requirements-cuda.txt
+include requirements-rocm.txt
+include requirements-neuron.txt
+include requirements-cpu.txt
+include CMakeLists.txt
 
+recursive-include cmake *
 recursive-include csrc *
```
README.md (85 changed lines)

````diff
@@ -10,13 +10,33 @@ Easy, fast, and cheap LLM serving for everyone
 </h3>
 
 <p align="center">
-| <a href="https://vllm.readthedocs.io/en/latest/"><b>Documentation</b></a> | <a href="https://vllm.ai"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://discord.gg/jz7wjKhh6g"><b>Discord</b></a> |
+| <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://vllm.ai"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://discord.gg/jz7wjKhh6g"><b>Discord</b></a> |
 
 </p>
 
 ---
 
+**Ray Summit CPF is Open (June 4th to June 20th)!**
+
+There will be a track for vLLM at the Ray Summit (09/30-10/02, SF) this year!
+If you have cool projects related to vLLM or LLM inference, we would love to see your proposals.
+This will be a great chance for everyone in the community to get together and learn.
+Please submit your proposal [here](https://raysummit.anyscale.com/flow/anyscale/raysummit2024/landing/page/eventsite)
+
+**The Fourth vLLM Bay Area Meetup (June 11th 5:30pm-8pm PT)**
+
+We are thrilled to announce our fourth vLLM Meetup!
+The vLLM team will share recent updates and roadmap.
+We will also have vLLM collaborators from BentoML and Cloudflare coming up to the stage to discuss their experience in deploying LLMs with vLLM.
+Please register [here](https://lu.ma/agivllm) and join us!
+
+---
+
 *Latest News* 🔥
+- [2024/04] We hosted [the third vLLM meetup](https://robloxandvllmmeetup2024.splashthat.com/) with Roblox! Please find the meetup slides [here](https://docs.google.com/presentation/d/1A--47JAK4BJ39t954HyTkvtfwn0fkqtsL8NGFuslReM/edit?usp=sharing).
+- [2024/01] We hosted [the second vLLM meetup](https://lu.ma/ygxbpzhl) in SF! Please find the meetup slides [here](https://docs.google.com/presentation/d/12mI2sKABnUw5RBWXDYY-HtHth4iMSNcEoQ10jDQbxgA/edit?usp=sharing).
+- [2024/01] Added ROCm 6.0 support to vLLM.
+- [2023/12] Added ROCm 5.7 support to vLLM.
 - [2023/10] We hosted [the first vLLM meetup](https://lu.ma/first-vllm-meetup) in SF! Please find the meetup slides [here](https://docs.google.com/presentation/d/1QL-XPFXiFpDBh86DbEegFXBXFXjix4v032GhShbKf3s/edit?usp=sharing).
 - [2023/09] We created our [Discord server](https://discord.gg/jz7wjKhh6g)! Join us to discuss vLLM and LLM serving! We will also post the latest announcements and updates there.
 - [2023/09] We released our [PagedAttention paper](https://arxiv.org/abs/2309.06180) on arXiv!
@@ -26,7 +46,7 @@ Easy, fast, and cheap LLM serving for everyone
 - [2023/06] We officially released vLLM! FastChat-vLLM integration has powered [LMSYS Vicuna and Chatbot Arena](https://chat.lmsys.org) since mid-April. Check out our [blog post](https://vllm.ai).
 
 ---
-
+## About
 vLLM is a fast and easy-to-use library for LLM inference and serving.
 
 vLLM is fast with:
@@ -34,6 +54,8 @@ vLLM is fast with:
 - State-of-the-art serving throughput
 - Efficient management of attention key and value memory with **PagedAttention**
 - Continuous batching of incoming requests
+- Fast model execution with CUDA/HIP graph
+- Quantization: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), [SqueezeLLM](https://arxiv.org/abs/2306.07629), FP8 KV Cache
 - Optimized CUDA kernels
 
 vLLM is flexible and easy to use with:
@@ -43,26 +65,18 @@ vLLM is flexible and easy to use with:
 - Tensor parallelism support for distributed inference
 - Streaming outputs
 - OpenAI-compatible API server
+- Support NVIDIA GPUs and AMD GPUs
+- (Experimental) Prefix caching support
+- (Experimental) Multi-lora support
 
-vLLM seamlessly supports many Hugging Face models, including the following architectures:
+vLLM seamlessly supports most popular open-source models on HuggingFace, including:
+- Transformer-like LLMs (e.g., Llama)
+- Mixture-of-Expert LLMs (e.g., Mixtral)
+- Multi-modal LLMs (e.g., LLaVA)
 
-- Aquila & Aquila2 (`BAAI/AquilaChat2-7B`, `BAAI/AquilaChat2-34B`, `BAAI/Aquila-7B`, `BAAI/AquilaChat-7B`, etc.)
-- Baichuan (`baichuan-inc/Baichuan-7B`, `baichuan-inc/Baichuan-13B-Chat`, etc.)
-- BLOOM (`bigscience/bloom`, `bigscience/bloomz`, etc.)
-- ChatGLM (`THUDM/chatglm2-6b`, `THUDM/chatglm3-6b`, etc.)
-- Falcon (`tiiuae/falcon-7b`, `tiiuae/falcon-40b`, `tiiuae/falcon-rw-7b`, etc.)
-- GPT-2 (`gpt2`, `gpt2-xl`, etc.)
-- GPT BigCode (`bigcode/starcoder`, `bigcode/gpt_bigcode-santacoder`, etc.)
-- GPT-J (`EleutherAI/gpt-j-6b`, `nomic-ai/gpt4all-j`, etc.)
-- GPT-NeoX (`EleutherAI/gpt-neox-20b`, `databricks/dolly-v2-12b`, `stabilityai/stablelm-tuned-alpha-7b`, etc.)
-- InternLM (`internlm/internlm-7b`, `internlm/internlm-chat-7b`, etc.)
-- LLaMA & LLaMA-2 (`meta-llama/Llama-2-70b-hf`, `lmsys/vicuna-13b-v1.3`, `young-geng/koala`, `openlm-research/open_llama_13b`, etc.)
-- Mistral (`mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc.)
-- MPT (`mosaicml/mpt-7b`, `mosaicml/mpt-30b`, etc.)
-- OPT (`facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc.)
-- Phi-1.5 (`microsoft/phi-1_5`, etc.)
-- Qwen (`Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc.)
-- Yi (`01-ai/Yi-6B`, `01-ai/Yi-34B`, etc.)
+Find the full list of supported models [here](https://docs.vllm.ai/en/latest/models/supported_models.html).
+
+## Getting Started
 
 Install vLLM with pip or [from source](https://vllm.readthedocs.io/en/latest/getting_started/installation.html#build-from-source):
 
@@ -70,9 +84,7 @@ Install vLLM with pip or [from source](https://vllm.readthedocs.io/en/latest/get
 pip install vllm
 ```
 
-## Getting Started
-
-Visit our [documentation](https://vllm.readthedocs.io/en/latest/) to get started.
+Visit our [documentation](https://vllm.readthedocs.io/en/latest/) to learn more.
 
 - [Installation](https://vllm.readthedocs.io/en/latest/getting_started/installation.html)
 - [Quickstart](https://vllm.readthedocs.io/en/latest/getting_started/quickstart.html)
 - [Supported Models](https://vllm.readthedocs.io/en/latest/models/supported_models.html)
@@ -82,6 +94,33 @@ Visit our [documentation](https://vllm.readthedocs.io/en/latest/) to get started
 We welcome and value any contributions and collaborations.
 Please check out [CONTRIBUTING.md](./CONTRIBUTING.md) for how to get involved.
 
+## Sponsors
+
+vLLM is a community project. Our compute resources for development and testing are supported by the following organizations. Thank you for your support!
+
+<!-- Note: Please sort them in alphabetical order. -->
+<!-- Note: Please keep these consistent with docs/source/community/sponsors.md -->
+
+- a16z
+- AMD
+- Anyscale
+- AWS
+- Crusoe Cloud
+- Databricks
+- DeepInfra
+- Dropbox
+- Lambda Lab
+- NVIDIA
+- Replicate
+- Roblox
+- RunPod
+- Sequoia Capital
+- Trainy
+- UC Berkeley
+- UC San Diego
+
+We also have an official fundraising venue through [OpenCollective](https://opencollective.com/vllm). We plan to use the fund to support the development, maintenance, and adoption of vLLM.
+
 ## Citation
 
 If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs/2309.06180):
````
benchmarks/backend_request_func.py (new file, 395 lines)

```python
import json
import os
import sys
import time
import traceback
from dataclasses import dataclass, field
from typing import List, Optional

import aiohttp
from tqdm.asyncio import tqdm

AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)


@dataclass
class RequestFuncInput:
    prompt: str
    api_url: str
    prompt_len: int
    output_len: int
    model: str
    best_of: int = 1
    use_beam_search: bool = False


@dataclass
class RequestFuncOutput:
    generated_text: str = ""
    success: bool = False
    latency: float = 0.0
    ttft: float = 0.0  # Time to first token
    itl: List[float] = field(
        default_factory=list)  # List of inter-token latencies
    prompt_len: int = 0
    error: str = ""


async def async_request_tgi(
    request_func_input: RequestFuncInput,
    pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
    api_url = request_func_input.api_url
    assert api_url.endswith("generate_stream")

    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
        assert not request_func_input.use_beam_search
        params = {
            "best_of": request_func_input.best_of,
            "max_new_tokens": request_func_input.output_len,
            "do_sample": True,
            "temperature": 0.01,  # TGI does not accept 0.0 temperature.
            "top_p": 0.99,  # TGI does not accept 1.0 top_p.
        }
        payload = {
            "inputs": request_func_input.prompt,
            "parameters": params,
        }
        output = RequestFuncOutput()
        output.prompt_len = request_func_input.prompt_len

        ttft = 0.0
        st = time.perf_counter()
        most_recent_timestamp = st
        try:
            async with session.post(url=api_url, json=payload) as response:
                if response.status == 200:
                    async for chunk_bytes in response.content:
                        chunk_bytes = chunk_bytes.strip()
                        if not chunk_bytes:
                            continue

                        chunk = remove_prefix(chunk_bytes.decode("utf-8"),
                                              "data:")

                        data = json.loads(chunk)
                        timestamp = time.perf_counter()
                        # First token
                        if ttft == 0.0:
                            ttft = time.perf_counter() - st
                            output.ttft = ttft

                        # Decoding phase
                        else:
                            output.itl.append(timestamp -
                                              most_recent_timestamp)

                        most_recent_timestamp = timestamp

                    output.latency = most_recent_timestamp - st
                    output.success = True
                    output.generated_text = data["generated_text"]
                else:
                    output.error = response.reason or ""
                    output.success = False
        except Exception:
            output.success = False
            exc_info = sys.exc_info()
            output.error = "".join(traceback.format_exception(*exc_info))

        if pbar:
            pbar.update(1)
        return output


async def async_request_trt_llm(
    request_func_input: RequestFuncInput,
    pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
    api_url = request_func_input.api_url
    assert api_url.endswith("generate_stream")

    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
        assert not request_func_input.use_beam_search
        assert request_func_input.best_of == 1
        payload = {
            "accumulate_tokens": True,
            "text_input": request_func_input.prompt,
            "temperature": 0.0,
            "top_p": 1.0,
            "max_tokens": request_func_input.output_len,
            "stream": True,
        }
        output = RequestFuncOutput()
        output.prompt_len = request_func_input.prompt_len

        ttft = 0.0
        st = time.perf_counter()
        most_recent_timestamp = st
        try:
            async with session.post(url=api_url, json=payload) as response:
                if response.status == 200:
                    async for chunk_bytes in response.content:
                        chunk_bytes = chunk_bytes.strip()
                        if not chunk_bytes:
                            continue

                        chunk = remove_prefix(chunk_bytes.decode("utf-8"),
                                              "data:")

                        data = json.loads(chunk)
                        output.generated_text += data["text_output"]
                        timestamp = time.perf_counter()
                        # First token
                        if ttft == 0.0:
                            ttft = time.perf_counter() - st
                            output.ttft = ttft

                        # Decoding phase
                        else:
                            output.itl.append(timestamp -
                                              most_recent_timestamp)

                        most_recent_timestamp = timestamp

                    output.latency = most_recent_timestamp - st
                    output.success = True

                else:
                    output.error = response.reason or ""
                    output.success = False
        except Exception:
            output.success = False
            exc_info = sys.exc_info()
            output.error = "".join(traceback.format_exception(*exc_info))

        if pbar:
            pbar.update(1)
        return output


async def async_request_deepspeed_mii(
    request_func_input: RequestFuncInput,
    pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
        assert request_func_input.best_of == 1
        assert not request_func_input.use_beam_search

        payload = {
            "prompt": request_func_input.prompt,
            "max_tokens": request_func_input.output_len,
            "temperature": 0.01,  # deepspeed-mii does not accept 0.0 temp.
            "top_p": 1.0,
        }
        output = RequestFuncOutput()
        output.prompt_len = request_func_input.prompt_len

        # NOTE: DeepSpeed-MII doesn't support streaming as of Jan 28 2024,
        # will use 0 as placeholder.
        # See https://github.com/microsoft/DeepSpeed-MII/pull/311
        output.ttft = 0

        st = time.perf_counter()
        try:
            async with session.post(url=request_func_input.api_url,
                                    json=payload) as response:
                if response.status == 200:
                    parsed_resp = await response.json()
                    output.latency = time.perf_counter() - st
                    output.generated_text = parsed_resp["text"][0]
                    output.success = True
                else:
                    output.error = response.reason or ""
                    output.success = False
        except Exception:
            output.success = False
            exc_info = sys.exc_info()
            output.error = "".join(traceback.format_exception(*exc_info))

        if pbar:
            pbar.update(1)
        return output


async def async_request_openai_completions(
    request_func_input: RequestFuncInput,
    pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
    api_url = request_func_input.api_url
    assert api_url.endswith(
        "v1/completions"
    ), "OpenAI Completions API URL must end with 'v1/completions'."

    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
        assert not request_func_input.use_beam_search
        payload = {
            "model": request_func_input.model,
            "prompt": request_func_input.prompt,
            "temperature": 0.0,
            "best_of": request_func_input.best_of,
            "max_tokens": request_func_input.output_len,
            "stream": True,
        }
        headers = {
            "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"
        }

        output = RequestFuncOutput()
        output.prompt_len = request_func_input.prompt_len

        generated_text = ""
        ttft = 0.0
        st = time.perf_counter()
        most_recent_timestamp = st
        try:
            async with session.post(url=api_url, json=payload,
                                    headers=headers) as response:
                if response.status == 200:
                    async for chunk_bytes in response.content:
                        chunk_bytes = chunk_bytes.strip()
                        if not chunk_bytes:
                            continue

                        chunk = remove_prefix(chunk_bytes.decode("utf-8"),
                                              "data: ")
                        if chunk == "[DONE]":
                            latency = time.perf_counter() - st
                        else:
                            data = json.loads(chunk)

                            if data["choices"][0]["text"]:
                                timestamp = time.perf_counter()
                                # First token
                                if ttft == 0.0:
                                    ttft = time.perf_counter() - st
                                    output.ttft = ttft

                                # Decoding phase
                                # NOTE: Some completion API might have a last
                                # usage summary response without a token so we
                                # do not want to include as inter-token-latency
                                elif data.get("usage", None) is None:
                                    output.itl.append(timestamp -
                                                      most_recent_timestamp)

                                most_recent_timestamp = timestamp
                                generated_text += data["choices"][0]["text"]

                    output.generated_text = generated_text
                    output.success = True
                    output.latency = latency
                else:
                    output.error = response.reason or ""
                    output.success = False
        except Exception:
            output.success = False
            exc_info = sys.exc_info()
            output.error = "".join(traceback.format_exception(*exc_info))

    if pbar:
        pbar.update(1)
    return output


async def async_request_openai_chat_completions(
    request_func_input: RequestFuncInput,
    pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
    api_url = request_func_input.api_url
    assert api_url.endswith(
        "v1/chat/completions"
    ), "OpenAI Chat Completions API URL must end with 'v1/chat/completions'."

    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
        assert not request_func_input.use_beam_search
        payload = {
            "model": request_func_input.model,
            "messages": [
                {
                    "role": "user",
                    "content": request_func_input.prompt,
                },
            ],
            "temperature": 0.0,
            "max_tokens": request_func_input.output_len,
            "stream": True,
        }
        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
        }

        output = RequestFuncOutput()
        output.prompt_len = request_func_input.prompt_len

        generated_text = ""
        ttft = 0.0
        st = time.perf_counter()
        most_recent_timestamp = st
        try:
            async with session.post(url=api_url, json=payload,
                                    headers=headers) as response:
                if response.status == 200:
                    async for chunk_bytes in response.content:
                        chunk_bytes = chunk_bytes.strip()
                        if not chunk_bytes:
                            continue

                        chunk = remove_prefix(chunk_bytes.decode("utf-8"),
                                              "data: ")
                        if chunk == "[DONE]":
                            latency = time.perf_counter() - st
                        else:
                            timestamp = time.perf_counter()
                            data = json.loads(chunk)

                            delta = data["choices"][0]["delta"]
                            if delta.get("content", None):
                                # First token
                                if ttft == 0.0:
                                    ttft = time.perf_counter() - st
                                    output.ttft = ttft

                                # Decoding phase
                                else:
                                    output.itl.append(timestamp -
                                                      most_recent_timestamp)

                                generated_text += delta["content"]

                            most_recent_timestamp = timestamp

                    output.generated_text = generated_text
                    output.success = True
                    output.latency = latency
                else:
                    output.error = response.reason or ""
                    output.success = False
        except Exception:
            output.success = False
            exc_info = sys.exc_info()
            output.error = "".join(traceback.format_exception(*exc_info))

    if pbar:
        pbar.update(1)
    return output


# Since vllm must support Python 3.8, we can't use str.removeprefix(prefix)
# introduced in Python 3.9
def remove_prefix(text: str, prefix: str) -> str:
    if text.startswith(prefix):
        return text[len(prefix):]
    return text


ASYNC_REQUEST_FUNCS = {
    "tgi": async_request_tgi,
    "vllm": async_request_openai_completions,
    "lmdeploy": async_request_openai_completions,
    "deepspeed-mii": async_request_deepspeed_mii,
    "openai": async_request_openai_completions,
    "openai-chat": async_request_openai_chat_completions,
    "tensorrt-llm": async_request_trt_llm,
}
```
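A hedged smoke-test sketch for the backends registered in `ASYNC_REQUEST_FUNCS` above: the `vllm`, `openai`, and `lmdeploy` entries expect an `api_url` ending in `v1/completions`. The model name and port below are placeholders.

```bash
# Start an OpenAI-compatible vLLM server, then point a benchmark client that
# uses async_request_openai_completions at http://localhost:8000/v1/completions.
python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --port 8000
```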
||||||
@ -1,30 +1,43 @@
|
|||||||
"""Benchmark the latency of processing a single batch of requests."""
|
"""Benchmark the latency of processing a single batch of requests."""
|
||||||
import argparse
|
import argparse
|
||||||
|
import json
|
||||||
import time
|
import time
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import List, Optional
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import torch
|
import torch
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
|
||||||
from vllm import LLM, SamplingParams
|
from vllm import LLM, SamplingParams
|
||||||
|
from vllm.inputs import PromptStrictInputs
|
||||||
|
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
|
||||||
|
|
||||||
|
|
||||||
def main(args: argparse.Namespace):
|
def main(args: argparse.Namespace):
|
||||||
print(args)
|
print(args)
|
||||||
|
|
||||||
# Process all the requests in a single batch if possible.
|
|
||||||
# NOTE(woosuk): If the request cannot be processed in a single batch,
|
# NOTE(woosuk): If the request cannot be processed in a single batch,
|
||||||
# the engine will automatically process the request in multiple batches.
|
# the engine will automatically process the request in multiple batches.
|
||||||
llm = LLM(
|
llm = LLM(model=args.model,
|
||||||
model=args.model,
|
speculative_model=args.speculative_model,
|
||||||
tokenizer=args.tokenizer,
|
num_speculative_tokens=args.num_speculative_tokens,
|
||||||
quantization=args.quantization,
|
tokenizer=args.tokenizer,
|
||||||
tensor_parallel_size=args.tensor_parallel_size,
|
quantization=args.quantization,
|
||||||
max_num_seqs=args.batch_size,
|
tensor_parallel_size=args.tensor_parallel_size,
|
||||||
max_num_batched_tokens=args.batch_size * args.input_len,
|
trust_remote_code=args.trust_remote_code,
|
||||||
trust_remote_code=args.trust_remote_code,
|
dtype=args.dtype,
|
||||||
dtype=args.dtype,
|
enforce_eager=args.enforce_eager,
|
||||||
)
|
kv_cache_dtype=args.kv_cache_dtype,
|
||||||
|
quantization_param_path=args.quantization_param_path,
|
||||||
|
device=args.device,
|
||||||
|
ray_workers_use_nsight=args.ray_workers_use_nsight,
|
||||||
|
use_v2_block_manager=args.use_v2_block_manager,
|
||||||
|
enable_chunked_prefill=args.enable_chunked_prefill,
|
||||||
|
download_dir=args.download_dir,
|
||||||
|
block_size=args.block_size,
|
||||||
|
gpu_memory_utilization=args.gpu_memory_utilization,
|
||||||
|
distributed_executor_backend=args.distributed_executor_backend)
|
||||||
|
|
||||||
sampling_params = SamplingParams(
|
sampling_params = SamplingParams(
|
||||||
n=args.n,
|
n=args.n,
|
||||||
@ -35,31 +48,69 @@ def main(args: argparse.Namespace):
|
|||||||
max_tokens=args.output_len,
|
max_tokens=args.output_len,
|
||||||
)
|
)
|
||||||
print(sampling_params)
|
print(sampling_params)
|
||||||
dummy_prompt_token_ids = [[0] * args.input_len] * args.batch_size
|
dummy_prompt_token_ids = np.random.randint(10000,
|
||||||
|
size=(args.batch_size,
|
||||||
|
args.input_len))
|
||||||
|
dummy_inputs: List[PromptStrictInputs] = [{
|
||||||
|
"prompt_token_ids": batch
|
||||||
|
} for batch in dummy_prompt_token_ids.tolist()]
|
||||||
|
|
||||||
def run_to_completion(profile: bool = False):
|
def run_to_completion(profile_dir: Optional[str] = None):
|
||||||
if profile:
|
if profile_dir:
|
||||||
torch.cuda.cudart().cudaProfilerStart()
|
with torch.profiler.profile(
|
||||||
start_time = time.perf_counter()
|
activities=[
|
||||||
|
torch.profiler.ProfilerActivity.CPU,
|
||||||
llm.generate(prompt_token_ids=dummy_prompt_token_ids,
|
torch.profiler.ProfilerActivity.CUDA,
|
||||||
sampling_params=sampling_params,
|
],
|
||||||
use_tqdm=False)
|
on_trace_ready=torch.profiler.tensorboard_trace_handler(
|
||||||
|
str(profile_dir))) as p:
|
||||||
end_time = time.perf_counter()
|
llm.generate(dummy_inputs,
|
||||||
latency = end_time - start_time
|
sampling_params=sampling_params,
|
||||||
if profile:
|
use_tqdm=False)
|
||||||
torch.cuda.cudart().cudaProfilerStop()
|
print(p.key_averages())
|
||||||
return latency
|
else:
|
||||||
|
start_time = time.perf_counter()
|
||||||
|
llm.generate(dummy_inputs,
|
||||||
|
sampling_params=sampling_params,
|
||||||
|
use_tqdm=False)
|
||||||
|
end_time = time.perf_counter()
|
||||||
|
latency = end_time - start_time
|
||||||
|
return latency
|
||||||
|
|
||||||
print("Warming up...")
|
print("Warming up...")
|
||||||
run_to_completion(profile=False)
|
for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"):
|
||||||
|
run_to_completion(profile_dir=None)
|
||||||
|
|
||||||
|
if args.profile:
|
||||||
|
profile_dir = args.profile_result_dir
|
||||||
|
if not profile_dir:
|
||||||
|
profile_dir = Path(
|
||||||
|
"."
|
||||||
|
) / "vllm_benchmark_result" / f"latency_result_{time.time()}"
|
||||||
|
print(f"Profiling (results will be saved to '{profile_dir}')...")
|
||||||
|
run_to_completion(profile_dir=profile_dir)
|
||||||
|
return
|
||||||
|
|
||||||
# Benchmark.
|
# Benchmark.
|
||||||
latencies = []
|
latencies = []
|
||||||
for _ in tqdm(range(args.num_iters), desc="Profiling iterations"):
|
for _ in tqdm(range(args.num_iters), desc="Profiling iterations"):
|
||||||
latencies.append(run_to_completion(profile=False))
|
latencies.append(run_to_completion(profile_dir=None))
|
||||||
|
latencies = np.array(latencies)
|
||||||
|
percentages = [10, 25, 50, 75, 90]
|
||||||
|
percentiles = np.percentile(latencies, percentages)
|
||||||
print(f'Avg latency: {np.mean(latencies)} seconds')
|
print(f'Avg latency: {np.mean(latencies)} seconds')
|
||||||
|
for percentage, percentile in zip(percentages, percentiles):
|
||||||
|
print(f'{percentage}% percentile latency: {percentile} seconds')
|
||||||
|
|
||||||
|
# Output JSON results if specified
|
||||||
|
if args.output_json:
|
||||||
|
results = {
|
||||||
|
"avg_latency": np.mean(latencies),
|
||||||
|
"latencies": latencies.tolist(),
|
||||||
|
"percentiles": dict(zip(percentages, percentiles.tolist())),
|
||||||
|
}
|
||||||
|
with open(args.output_json, "w") as f:
|
||||||
|
json.dump(results, f, indent=4)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
@ -67,10 +118,12 @@ if __name__ == '__main__':
|
|||||||
description='Benchmark the latency of processing a single batch of '
|
description='Benchmark the latency of processing a single batch of '
|
||||||
'requests till completion.')
|
'requests till completion.')
|
||||||
parser.add_argument('--model', type=str, default='facebook/opt-125m')
|
parser.add_argument('--model', type=str, default='facebook/opt-125m')
|
||||||
|
parser.add_argument('--speculative-model', type=str, default=None)
|
||||||
|
parser.add_argument('--num-speculative-tokens', type=int, default=None)
|
||||||
parser.add_argument('--tokenizer', type=str, default=None)
|
parser.add_argument('--tokenizer', type=str, default=None)
|
||||||
parser.add_argument('--quantization',
|
parser.add_argument('--quantization',
|
||||||
'-q',
|
'-q',
|
||||||
choices=['awq', 'squeezellm', None],
|
choices=[*QUANTIZATION_METHODS, None],
|
||||||
default=None)
|
default=None)
|
||||||
parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1)
|
parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1)
|
||||||
parser.add_argument('--input-len', type=int, default=32)
|
parser.add_argument('--input-len', type=int, default=32)
|
||||||
@ -81,9 +134,13 @@ if __name__ == '__main__':
|
|||||||
default=1,
|
default=1,
|
||||||
help='Number of generated sequences per prompt.')
|
help='Number of generated sequences per prompt.')
|
||||||
parser.add_argument('--use-beam-search', action='store_true')
|
parser.add_argument('--use-beam-search', action='store_true')
|
||||||
|
parser.add_argument('--num-iters-warmup',
|
||||||
|
type=int,
|
||||||
|
default=10,
|
||||||
|
help='Number of iterations to run for warmup.')
|
||||||
parser.add_argument('--num-iters',
|
parser.add_argument('--num-iters',
|
||||||
type=int,
|
type=int,
|
||||||
default=3,
|
default=30,
|
||||||
help='Number of iterations to run.')
|
help='Number of iterations to run.')
|
||||||
parser.add_argument('--trust-remote-code',
|
parser.add_argument('--trust-remote-code',
|
||||||
action='store_true',
|
action='store_true',
|
||||||
@ -97,5 +154,80 @@ if __name__ == '__main__':
|
|||||||
'The "auto" option will use FP16 precision '
|
'The "auto" option will use FP16 precision '
|
||||||
'for FP32 and FP16 models, and BF16 precision '
|
'for FP32 and FP16 models, and BF16 precision '
|
||||||
'for BF16 models.')
|
'for BF16 models.')
|
||||||
|
parser.add_argument('--enforce-eager',
|
||||||
|
action='store_true',
|
||||||
|
help='enforce eager mode and disable CUDA graph')
|
||||||
|
parser.add_argument(
|
||||||
|
'--kv-cache-dtype',
|
||||||
|
type=str,
|
||||||
|
choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3'],
|
||||||
|
default="auto",
|
||||||
|
help='Data type for kv cache storage. If "auto", will use model '
|
||||||
|
'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. '
|
||||||
|
'ROCm (AMD GPU) supports fp8 (=fp8_e4m3)')
|
||||||
|
parser.add_argument(
|
||||||
|
'--quantization-param-path',
|
||||||
|
type=str,
|
||||||
|
default=None,
|
||||||
|
help='Path to the JSON file containing the KV cache scaling factors. '
|
||||||
|
'This should generally be supplied, when KV cache dtype is FP8. '
|
||||||
|
'Otherwise, KV cache scaling factors default to 1.0, which may cause '
|
||||||
|
'accuracy issues. FP8_E5M2 (without scaling) is only supported on '
|
||||||
|
'cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is '
|
||||||
|
'instead supported for common inference criteria.')
|
||||||
|
parser.add_argument(
|
||||||
|
'--profile',
|
||||||
|
action='store_true',
|
||||||
|
help='profile the generation process of a single batch')
|
||||||
|
parser.add_argument(
|
||||||
|
'--profile-result-dir',
|
||||||
|
type=str,
|
||||||
|
default=None,
|
||||||
|
help=('path to save the pytorch profiler output. Can be visualized '
|
||||||
|
'with ui.perfetto.dev or Tensorboard.'))
|
||||||
|
parser.add_argument(
|
||||||
|
"--device",
|
||||||
|
type=str,
|
||||||
|
default="cuda",
|
||||||
|
choices=["cuda", "cpu"],
|
||||||
|
help='device type for vLLM execution, supporting CUDA and CPU.')
|
||||||
|
parser.add_argument('--block-size',
|
||||||
|
type=int,
|
||||||
|
default=16,
|
||||||
|
help='block size of key/value cache')
|
||||||
|
parser.add_argument(
|
||||||
|
'--enable-chunked-prefill',
|
||||||
|
action='store_true',
|
||||||
|
help='If True, the prefill requests can be chunked based on the '
|
||||||
|
'max_num_batched_tokens')
|
||||||
|
parser.add_argument('--use-v2-block-manager', action='store_true')
|
||||||
|
parser.add_argument(
|
||||||
|
"--ray-workers-use-nsight",
|
||||||
|
action='store_true',
|
||||||
|
help="If specified, use nsight to profile ray workers",
|
||||||
|
)
|
||||||
|
parser.add_argument('--download-dir',
|
||||||
|
type=str,
|
||||||
|
default=None,
|
||||||
|
help='directory to download and load the weights, '
|
||||||
|
'default to the default cache dir of huggingface')
|
||||||
|
parser.add_argument(
|
||||||
|
'--output-json',
|
||||||
|
type=str,
|
||||||
|
default=None,
|
||||||
|
help='Path to save the latency results in JSON format.')
|
||||||
|
parser.add_argument('--gpu-memory-utilization',
|
||||||
|
type=float,
|
||||||
|
default=0.9,
|
||||||
|
help='the fraction of GPU memory to be used for '
|
||||||
|
'the model executor, which can range from 0 to 1.'
|
||||||
|
'If unspecified, will use the default value of 0.9.')
|
||||||
|
parser.add_argument(
|
||||||
|
'--distributed-executor-backend',
|
||||||
|
choices=['ray', 'mp'],
|
||||||
|
default=None,
|
||||||
|
help='Backend to use for distributed serving. When more than 1 GPU '
|
||||||
|
'is used, will be automatically set to "ray" if installed '
|
||||||
|
'or "mp" (multiprocessing) otherwise.')
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
main(args)
|
main(args)
|
||||||
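Not part of the diff: a minimal sketch of driving the latency benchmark with a few of the flags added above. The script path and model name are placeholders, and a working vLLM installation is assumed.

import subprocess

cmd = [
    "python", "benchmarks/benchmark_latency.py",
    "--model", "facebook/opt-125m",      # placeholder model
    "--enforce-eager",                   # skip CUDA graph capture
    "--kv-cache-dtype", "auto",          # or "fp8" on supported CUDA/ROCm builds
    "--output-json", "latency.json",     # save the latency results as JSON
]
subprocess.run(cmd, check=True)          # runs the benchmark end to end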
benchmarks/benchmark_prefix_caching.py (new file, 62 lines)
@@ -0,0 +1,62 @@
+import argparse
+import time
+
+from vllm import LLM, SamplingParams
+
+PROMPT = "You are a helpful assistant in recognizes the content of tables in markdown format. Here is a table as fellows. You need to answer my question about the table.\n# Table\n|Opening|Opening|Sl. No.|Film|Cast|Director|Music Director|Notes|\n|----|----|----|----|----|----|----|----|\n|J A N|9|1|Agni Pushpam|Jayabharathi, Kamalahasan|Jeassy|M. K. Arjunan||\n|J A N|16|2|Priyamvada|Mohan Sharma, Lakshmi, KPAC Lalitha|K. S. Sethumadhavan|V. Dakshinamoorthy||\n|J A N|23|3|Yakshagaanam|Madhu, Sheela|Sheela|M. S. Viswanathan||\n|J A N|30|4|Paalkkadal|Sheela, Sharada|T. K. Prasad|A. T. Ummer||\n|F E B|5|5|Amma|Madhu, Srividya|M. Krishnan Nair|M. K. Arjunan||\n|F E B|13|6|Appooppan|Thikkurissi Sukumaran Nair, Kamal Haasan|P. Bhaskaran|M. S. Baburaj||\n|F E B|20|7|Srishti|Chowalloor Krishnankutty, Ravi Alummoodu|K. T. Muhammad|M. S. Baburaj||\n|F E B|20|8|Vanadevatha|Prem Nazir, Madhubala|Yusufali Kechery|G. Devarajan||\n|F E B|27|9|Samasya|Madhu, Kamalahaasan|K. Thankappan|Shyam||\n|F E B|27|10|Yudhabhoomi|K. P. Ummer, Vidhubala|Crossbelt Mani|R. K. Shekhar||\n|M A R|5|11|Seemantha Puthran|Prem Nazir, Jayabharathi|A. B. Raj|M. K. Arjunan||\n|M A R|12|12|Swapnadanam|Rani Chandra, Dr. Mohandas|K. G. George|Bhaskar Chandavarkar||\n|M A R|19|13|Thulavarsham|Prem Nazir, sreedevi, Sudheer|N. Sankaran Nair|V. Dakshinamoorthy||\n|M A R|20|14|Aruthu|Kaviyoor Ponnamma, Kamalahasan|Ravi|G. Devarajan||\n|M A R|26|15|Swimming Pool|Kamal Haasan, M. G. Soman|J. Sasikumar|M. K. Arjunan||\n\n# Question\nWhat' s the content in the (1,1) cells\n"  # noqa: E501
+
+
+def test_prefix(llm=None, sampling_params=None, prompts=None):
+    start_time = time.time()
+
+    llm.generate(prompts, sampling_params=sampling_params)
+
+    end_time = time.time()
+    print(f"cost time {end_time - start_time}")
+
+
+def main(args):
+    llm = LLM(model=args.model,
+              tokenizer_mode='auto',
+              trust_remote_code=True,
+              enforce_eager=True,
+              use_v2_block_manager=args.use_v2_block_manager,
+              tensor_parallel_size=args.tensor_parallel_size,
+              enable_prefix_caching=args.enable_prefix_caching)
+
+    num_prompts = 100
+    prompts = [PROMPT] * num_prompts
+    sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len)
+
+    print("------warm up------")
+    test_prefix(
+        llm=llm,
+        prompts=prompts,
+        sampling_params=sampling_params,
+    )
+
+    print("------start generating------")
+    test_prefix(
+        llm=llm,
+        prompts=prompts,
+        sampling_params=sampling_params,
+    )
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description='Benchmark the performance with or without automatic '
+        'prefix caching.')
+    parser.add_argument('--model',
+                        type=str,
+                        default='baichuan-inc/Baichuan2-13B-Chat')
+    parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1)
+    parser.add_argument('--output-len', type=int, default=10)
+    parser.add_argument('--enable-prefix-caching',
+                        action='store_true',
+                        help='enable prefix caching')
+    parser.add_argument('--use-v2-block-manager',
+                        action='store_true',
+                        help='Use BlockSpaceMangerV2')
+    args = parser.parse_args()
+    main(args)
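Not part of the diff: the new script above times the same batch of prompts twice so that the second pass can reuse cached prefix blocks. A standalone sketch of that measurement with the vLLM Python API, assuming a placeholder model and prompt and a GPU with vLLM installed:

import time

from vllm import LLM, SamplingParams

llm = LLM(model="facebook/opt-125m", enable_prefix_caching=True)  # placeholder model
shared_prefix = "Answer questions about the following table. " * 40
prompts = [f"{shared_prefix}Question {i}?" for i in range(8)]
params = SamplingParams(temperature=0, max_tokens=10)

for label in ("warm up", "cached run"):
    start = time.time()
    llm.generate(prompts, sampling_params=params)
    print(f"{label}: {time.time() - start:.2f} s")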
@@ -1,81 +1,179 @@
 """Benchmark online serving throughput.
 
 On the server side, run one of the following commands:
-    (vLLM backend)
-    python -m vllm.entrypoints.api_server \
+    vLLM OpenAI API server
+    python -m vllm.entrypoints.openai.api_server \
         --model <your_model> --swap-space 16 \
         --disable-log-requests
 
     (TGI backend)
-    ./launch_hf_server.sh <your_model>
+    ./launch_tgi_server.sh <your_model> <max_batch_total_tokens>
 
 On the client side, run:
     python benchmarks/benchmark_serving.py \
         --backend <backend> \
-        --tokenizer <your_model> --dataset <target_dataset> \
-        --request-rate <request_rate>
+        --model <your_model> \
+        --dataset-name sharegpt \
+        --dataset-path <path to dataset> \
+        --request-rate <request_rate> \ # By default <request_rate> is inf
+        --num-prompts <num_prompts> # By default <num_prompts> is 1000
+
+    when using tgi backend, add
+        --endpoint /generate_stream
+        to the end of the command above.
 """
 import argparse
 import asyncio
 import json
+import os
 import random
 import time
-from typing import AsyncGenerator, List, Tuple
+import warnings
+from dataclasses import dataclass
+from datetime import datetime
+from typing import AsyncGenerator, List, Optional, Tuple
 
-import aiohttp
 import numpy as np
+from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput,
+                                  RequestFuncOutput)
+from tqdm.asyncio import tqdm
 from transformers import PreTrainedTokenizerBase
+
 from vllm.transformers_utils.tokenizer import get_tokenizer
 
-# (prompt len, output len, latency)
-REQUEST_LATENCY: List[Tuple[int, int, float]] = []
+
+@dataclass
+class BenchmarkMetrics:
+    completed: int
+    total_input: int
+    total_output: int
+    request_throughput: float
+    input_throughput: float
+    output_throughput: float
+    mean_ttft_ms: float
+    median_ttft_ms: float
+    p99_ttft_ms: float
+    mean_tpot_ms: float
+    median_tpot_ms: float
+    p99_tpot_ms: float
+    mean_itl_ms: float
+    median_itl_ms: float
+    p99_itl_ms: float
 
 
-def sample_requests(
+def sample_sharegpt_requests(
     dataset_path: str,
     num_requests: int,
     tokenizer: PreTrainedTokenizerBase,
+    fixed_output_len: Optional[int] = None,
 ) -> List[Tuple[str, int, int]]:
+    if fixed_output_len is not None and fixed_output_len < 4:
+        raise ValueError("output_len too small")
+
     # Load the dataset.
     with open(dataset_path) as f:
         dataset = json.load(f)
     # Filter out the conversations with less than 2 turns.
-    dataset = [
-        data for data in dataset
-        if len(data["conversations"]) >= 2
-    ]
+    dataset = [data for data in dataset if len(data["conversations"]) >= 2]
     # Only keep the first two turns of each conversation.
-    dataset = [
-        (data["conversations"][0]["value"], data["conversations"][1]["value"])
-        for data in dataset
-    ]
+    dataset = [(data["conversations"][0]["value"],
+                data["conversations"][1]["value"]) for data in dataset]
 
-    # Tokenize the prompts and completions.
-    prompts = [prompt for prompt, _ in dataset]
-    prompt_token_ids = tokenizer(prompts).input_ids
-    completions = [completion for _, completion in dataset]
-    completion_token_ids = tokenizer(completions).input_ids
-    tokenized_dataset = []
-    for i in range(len(dataset)):
-        output_len = len(completion_token_ids[i])
-        tokenized_dataset.append((prompts[i], prompt_token_ids[i], output_len))
+    # Shuffle the dataset.
+    random.shuffle(dataset)
 
-    # Filter out too long sequences.
+    # Filter out sequences that are too long or too short
     filtered_dataset: List[Tuple[str, int, int]] = []
-    for prompt, prompt_token_ids, output_len in tokenized_dataset:
+    for i in range(len(dataset)):
+        if len(filtered_dataset) == num_requests:
+            break
+
+        # Tokenize the prompts and completions.
+        prompt = dataset[i][0]
+        prompt_token_ids = tokenizer(prompt).input_ids
+        completion = dataset[i][1]
+        completion_token_ids = tokenizer(completion).input_ids
         prompt_len = len(prompt_token_ids)
+        output_len = len(completion_token_ids
+                         ) if fixed_output_len is None else fixed_output_len
         if prompt_len < 4 or output_len < 4:
             # Prune too short sequences.
-            # This is because TGI causes errors when the input or output length
-            # is too short.
             continue
         if prompt_len > 1024 or prompt_len + output_len > 2048:
             # Prune too long sequences.
             continue
         filtered_dataset.append((prompt, prompt_len, output_len))
 
-    # Sample the requests.
-    sampled_requests = random.sample(filtered_dataset, num_requests)
+    return filtered_dataset
+
+
+def sample_sonnet_requests(
+    dataset_path: str,
+    num_requests: int,
+    input_len: int,
+    output_len: int,
+    prefix_len: int,
+    tokenizer: PreTrainedTokenizerBase,
+) -> List[Tuple[str, str, int, int]]:
+    assert (
+        input_len > prefix_len
+    ), "'args.sonnet-input-len' must be greater than 'args.prefix-input-len'."
+
+    # Load the dataset.
+    with open(dataset_path) as f:
+        poem_lines = f.readlines()
+
+    # Tokenize the poem lines.
+    poem_token_ids = tokenizer(poem_lines).input_ids
+    average_poem_len = sum(
+        len(token_ids) for token_ids in poem_token_ids) / len(poem_token_ids)
+
+    # Base prefix for all requests.
+    base_prompt = "Pick as many lines as you can from these poem lines:\n"
+    base_message = [{
+        "role": "user",
+        "content": base_prompt,
+    }]
+    base_prompt_formatted = tokenizer.apply_chat_template(
+        base_message, add_generation_prompt=True, tokenize=False)
+    base_prompt_offset = len(tokenizer(base_prompt_formatted).input_ids)
+
+    assert (
+        input_len > base_prompt_offset
+    ), f"Please set 'args.sonnet-input-len' higher than {base_prompt_offset}."
+    num_input_lines = round(
+        (input_len - base_prompt_offset) / average_poem_len)
+
+    # First approximately `prefix_len` number of tokens in the
+    # prompt are fixed poem lines.
+    assert (
+        prefix_len > base_prompt_offset
+    ), f"Please set 'args.sonnet-prefix-len' higher than {base_prompt_offset}."
+
+    num_prefix_lines = round(
+        (prefix_len - base_prompt_offset) / average_poem_len)
+    prefix_lines = poem_lines[:num_prefix_lines]
+
+    # Sample the rest of lines per request.
+    sampled_requests: List[Tuple[str, int, int]] = []
+    for _ in range(num_requests):
+        sampled_lines = "".join(
+            prefix_lines +
+            random.sample(poem_lines, num_input_lines - num_prefix_lines))
+
+        prompt = f"{base_prompt}{sampled_lines}"
+        message = [
+            {
+                "role": "user",
+                "content": prompt,
+            },
+        ]
+        prompt_formatted = tokenizer.apply_chat_template(
+            message, add_generation_prompt=True, tokenize=False)
+        prompt_len = len(tokenizer(prompt_formatted).input_ids)
+        sampled_requests.append(
+            (prompt, prompt_formatted, prompt_len, output_len))
 
     return sampled_requests
 
 
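Not part of the diff: a worked example of the line-count arithmetic in sample_sonnet_requests above. All numbers are invented for illustration.

input_len, prefix_len = 550, 200   # the --sonnet-input-len / --sonnet-prefix-len defaults
base_prompt_offset = 25            # assumed token length of the formatted base prompt
average_poem_len = 10.5            # assumed average tokens per poem line

num_input_lines = round((input_len - base_prompt_offset) / average_poem_len)    # -> 50
num_prefix_lines = round((prefix_len - base_prompt_offset) / average_poem_len)  # -> 17
print(num_input_lines, num_prefix_lines)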
@@ -96,79 +194,190 @@ async def get_request(
         await asyncio.sleep(interval)
 
 
-async def send_request(
-    backend: str,
-    api_url: str,
-    prompt: str,
-    prompt_len: int,
-    output_len: int,
-    best_of: int,
-    use_beam_search: bool,
-) -> None:
-    request_start_time = time.perf_counter()
-
-    headers = {"User-Agent": "Benchmark Client"}
-    if backend == "vllm":
-        pload = {
-            "prompt": prompt,
-            "n": 1,
-            "best_of": best_of,
-            "use_beam_search": use_beam_search,
-            "temperature": 0.0 if use_beam_search else 1.0,
-            "top_p": 1.0,
-            "max_tokens": output_len,
-            "ignore_eos": True,
-            "stream": False,
-        }
-    elif backend == "tgi":
-        assert not use_beam_search
-        params = {
-            "best_of": best_of,
-            "max_new_tokens": output_len,
-            "do_sample": True,
-        }
-        pload = {
-            "inputs": prompt,
-            "parameters": params,
-        }
-    else:
-        raise ValueError(f"Unknown backend: {backend}")
-
-    timeout = aiohttp.ClientTimeout(total=3 * 3600)
-    async with aiohttp.ClientSession(timeout=timeout) as session:
-        while True:
-            async with session.post(api_url, headers=headers, json=pload) as response:
-                chunks = []
-                async for chunk, _ in response.content.iter_chunks():
-                    chunks.append(chunk)
-            output = b"".join(chunks).decode("utf-8")
-            output = json.loads(output)
-
-            # Re-send the request if it failed.
-            if "error" not in output:
-                break
-
-    request_end_time = time.perf_counter()
-    request_latency = request_end_time - request_start_time
-    REQUEST_LATENCY.append((prompt_len, output_len, request_latency))
+def calculate_metrics(
+    input_requests: List[Tuple[str, int, int]],
+    outputs: List[RequestFuncOutput],
+    dur_s: float,
+    tokenizer: PreTrainedTokenizerBase,
+) -> Tuple[BenchmarkMetrics, List[int]]:
+    actual_output_lens = []
+    total_input = 0
+    completed = 0
+    itls = []
+    tpots = []
+    ttfts = []
+    for i in range(len(outputs)):
+        if outputs[i].success:
+            # We use the tokenizer to count the number of output tokens for all
+            # serving backends instead of looking at len(outputs[i].itl) since
+            # multiple output tokens may be bundled together
+            # Note: this may inflate the output token count slightly
+            output_len = len(
+                tokenizer(outputs[i].generated_text,
+                          add_special_tokens=False).input_ids)
+            actual_output_lens.append(output_len)
+            total_input += input_requests[i][1]
+            if output_len > 1:
+                tpots.append(
+                    (outputs[i].latency - outputs[i].ttft) / (output_len - 1))
+            itls += outputs[i].itl
+            ttfts.append(outputs[i].ttft)
+            completed += 1
+        else:
+            actual_output_lens.append(0)
+
+    if completed == 0:
+        warnings.warn(
+            "All requests failed. This is likely due to a misconfiguration "
+            "on the benchmark arguments.",
+            stacklevel=2)
+    metrics = BenchmarkMetrics(
+        completed=completed,
+        total_input=total_input,
+        total_output=sum(actual_output_lens),
+        request_throughput=completed / dur_s,
+        input_throughput=total_input / dur_s,
+        output_throughput=sum(actual_output_lens) / dur_s,
+        mean_ttft_ms=np.mean(ttfts or 0) *
+        1000,  # ttfts is empty if streaming is not supported by backend
+        median_ttft_ms=np.median(ttfts or 0) * 1000,
+        p99_ttft_ms=np.percentile(ttfts or 0, 99) * 1000,
+        mean_tpot_ms=np.mean(tpots or 0) * 1000,
+        median_tpot_ms=np.median(tpots or 0) * 1000,
+        p99_tpot_ms=np.percentile(tpots or 0, 99) * 1000,
+        mean_itl_ms=np.mean(itls or 0) * 1000,
+        median_itl_ms=np.median(itls or 0) * 1000,
+        p99_itl_ms=np.percentile(itls or 0, 99) * 1000,
+    )
+
+    return metrics, actual_output_lens
 
 
 async def benchmark(
     backend: str,
     api_url: str,
+    model_id: str,
+    tokenizer: PreTrainedTokenizerBase,
     input_requests: List[Tuple[str, int, int]],
     best_of: int,
     use_beam_search: bool,
     request_rate: float,
-) -> None:
-    tasks: List[asyncio.Task] = []
+    disable_tqdm: bool,
+):
+    if backend in ASYNC_REQUEST_FUNCS:
+        request_func = ASYNC_REQUEST_FUNCS.get(backend)
+    else:
+        raise ValueError(f"Unknown backend: {backend}")
+
+    print("Starting initial single prompt test run...")
+    test_prompt, test_prompt_len, test_output_len = input_requests[0]
+    test_input = RequestFuncInput(
+        model=model_id,
+        prompt=test_prompt,
+        api_url=api_url,
+        prompt_len=test_prompt_len,
+        output_len=test_output_len,
+        best_of=best_of,
+        use_beam_search=use_beam_search,
+    )
+    test_output = await request_func(request_func_input=test_input)
+    if not test_output.success:
+        raise ValueError(
+            "Initial test run failed - Please make sure benchmark arguments "
+            f"are correctly specified. Error: {test_output.error}")
+    else:
+        print("Initial test run completed. Starting main benchmark run...")
+    print(f"Traffic request rate: {request_rate}")
+
+    pbar = None if disable_tqdm else tqdm(total=len(input_requests))
+
+    benchmark_start_time = time.perf_counter()
+    tasks = []
     async for request in get_request(input_requests, request_rate):
         prompt, prompt_len, output_len = request
-        task = asyncio.create_task(send_request(backend, api_url, prompt,
-                                                prompt_len, output_len,
-                                                best_of, use_beam_search))
-        tasks.append(task)
-    await asyncio.gather(*tasks)
+        request_func_input = RequestFuncInput(
+            model=model_id,
+            prompt=prompt,
+            api_url=api_url,
+            prompt_len=prompt_len,
+            output_len=output_len,
+            best_of=best_of,
+            use_beam_search=use_beam_search,
+        )
+        tasks.append(
+            asyncio.create_task(
+                request_func(request_func_input=request_func_input,
+                             pbar=pbar)))
+    outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks)
+
+    if not disable_tqdm:
+        pbar.close()
+
+    benchmark_duration = time.perf_counter() - benchmark_start_time
+
+    metrics, actual_output_lens = calculate_metrics(
+        input_requests=input_requests,
+        outputs=outputs,
+        dur_s=benchmark_duration,
+        tokenizer=tokenizer,
+    )
+
+    print("{s:{c}^{n}}".format(s=' Serving Benchmark Result ', n=50, c='='))
+    print("{:<40} {:<10}".format("Successful requests:", metrics.completed))
+    print("{:<40} {:<10.2f}".format("Benchmark duration (s):",
+                                    benchmark_duration))
+    print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input))
+    print("{:<40} {:<10}".format("Total generated tokens:",
+                                 metrics.total_output))
+    print("{:<40} {:<10.2f}".format("Request throughput (req/s):",
+                                    metrics.request_throughput))
+    print("{:<40} {:<10.2f}".format("Input token throughput (tok/s):",
+                                    metrics.input_throughput))
+    print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):",
+                                    metrics.output_throughput))
+    print("{s:{c}^{n}}".format(s='Time to First Token', n=50, c='-'))
+    print("{:<40} {:<10.2f}".format("Mean TTFT (ms):", metrics.mean_ttft_ms))
+    print("{:<40} {:<10.2f}".format("Median TTFT (ms):",
+                                    metrics.median_ttft_ms))
+    print("{:<40} {:<10.2f}".format("P99 TTFT (ms):", metrics.p99_ttft_ms))
+    print("{s:{c}^{n}}".format(s='Time per Output Token (excl. 1st token)',
+                               n=50,
+                               c='-'))
+    print("{:<40} {:<10.2f}".format("Mean TPOT (ms):", metrics.mean_tpot_ms))
+    print("{:<40} {:<10.2f}".format("Median TPOT (ms):",
+                                    metrics.median_tpot_ms))
+    print("{:<40} {:<10.2f}".format("P99 TPOT (ms):", metrics.p99_tpot_ms))
+    print("{s:{c}^{n}}".format(s='Inter-token Latency', n=50, c='-'))
+    print("{:<40} {:<10.2f}".format("Mean ITL (ms):", metrics.mean_itl_ms))
+    print("{:<40} {:<10.2f}".format("Median ITL (ms):", metrics.median_itl_ms))
+    print("{:<40} {:<10.2f}".format("P99 ITL (ms):", metrics.p99_itl_ms))
+    print("=" * 50)
+
+    result = {
+        "duration": benchmark_duration,
+        "completed": metrics.completed,
+        "total_input_tokens": metrics.total_input,
+        "total_output_tokens": metrics.total_output,
+        "request_throughput": metrics.request_throughput,
+        "input_throughput": metrics.input_throughput,
+        "output_throughput": metrics.output_throughput,
+        "mean_ttft_ms": metrics.mean_ttft_ms,
+        "median_ttft_ms": metrics.median_ttft_ms,
+        "p99_ttft_ms": metrics.p99_ttft_ms,
+        "mean_tpot_ms": metrics.mean_tpot_ms,
+        "median_tpot_ms": metrics.median_tpot_ms,
+        "p99_tpot_ms": metrics.p99_tpot_ms,
+        "mean_itl_ms": metrics.mean_itl_ms,
+        "median_itl_ms": metrics.median_itl_ms,
+        "p99_itl_ms": metrics.p99_itl_ms,
+        "input_lens": [output.prompt_len for output in outputs],
+        "output_lens": actual_output_lens,
+        "ttfts": [output.ttft for output in outputs],
+        "itls": [output.itl for output in outputs],
+        "generated_texts": [output.generated_text for output in outputs],
+        "errors": [output.error for output in outputs],
+    }
+    return result
 
 
 def main(args: argparse.Namespace):
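Not part of the diff: a toy illustration of how calculate_metrics above turns one request's timings into TTFT and TPOT. The numbers are invented.

latency_s = 2.0    # end-to-end latency of one request
ttft_s = 0.25      # time to first token
output_len = 36    # output tokens counted by re-tokenizing the generated text

tpot_s = (latency_s - ttft_s) / (output_len - 1)  # time per output token, excluding the first
print(f"TTFT: {ttft_s * 1000:.1f} ms, TPOT: {tpot_s * 1000:.1f} ms")  # 250.0 ms, 50.0 ms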
@@ -176,58 +385,260 @@ def main(args: argparse.Namespace):
     random.seed(args.seed)
     np.random.seed(args.seed)
 
-    api_url = f"http://{args.host}:{args.port}/generate"
-    tokenizer = get_tokenizer(args.tokenizer, trust_remote_code=args.trust_remote_code)
-    input_requests = sample_requests(args.dataset, args.num_prompts, tokenizer)
+    backend = args.backend
+    model_id = args.model
+    tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model
 
-    benchmark_start_time = time.perf_counter()
-    asyncio.run(benchmark(args.backend, api_url, input_requests, args.best_of,
-                          args.use_beam_search, args.request_rate))
-    benchmark_end_time = time.perf_counter()
-    benchmark_time = benchmark_end_time - benchmark_start_time
-    print(f"Total time: {benchmark_time:.2f} s")
-    print(f"Throughput: {args.num_prompts / benchmark_time:.2f} requests/s")
+    if args.base_url is not None:
+        api_url = f"{args.base_url}{args.endpoint}"
+    else:
+        api_url = f"http://{args.host}:{args.port}{args.endpoint}"
 
-    # Compute the latency statistics.
-    avg_latency = np.mean([latency for _, _, latency in REQUEST_LATENCY])
-    print(f"Average latency: {avg_latency:.2f} s")
-    avg_per_token_latency = np.mean([
-        latency / (prompt_len + output_len)
-        for prompt_len, output_len, latency in REQUEST_LATENCY
-    ])
-    print(f"Average latency per token: {avg_per_token_latency:.2f} s")
-    avg_per_output_token_latency = np.mean([
-        latency / output_len
-        for _, output_len, latency in REQUEST_LATENCY
-    ])
-    print("Average latency per output token: "
-          f"{avg_per_output_token_latency:.2f} s")
+    tokenizer = get_tokenizer(tokenizer_id,
+                              trust_remote_code=args.trust_remote_code)
+
+    if args.dataset is not None:
+        warnings.warn(
+            "The '--dataset' argument will be deprecated in the next "
+            "release. Please use '--dataset-name' and "
+            "'--dataset-path' in the future runs.",
+            stacklevel=2)
+        input_requests = sample_sharegpt_requests(
+            dataset_path=args.dataset,
+            num_requests=args.num_prompts,
+            tokenizer=tokenizer,
+            fixed_output_len=args.sharegpt_output_len,
+        )
+
+    elif args.dataset_name == "sharegpt":
+        input_requests = sample_sharegpt_requests(
+            dataset_path=args.dataset_path,
+            num_requests=args.num_prompts,
+            tokenizer=tokenizer,
+            fixed_output_len=args.sharegpt_output_len,
+        )
+
+    elif args.dataset_name == "sonnet":
+        # Do not format the prompt, pass to message directly
+        if args.backend == "openai-chat":
+            input_requests = sample_sonnet_requests(
+                dataset_path=args.dataset_path,
+                num_requests=args.num_prompts,
+                input_len=args.sonnet_input_len,
+                output_len=args.sonnet_output_len,
+                prefix_len=args.sonnet_prefix_len,
+                tokenizer=tokenizer,
+            )
+            input_requests = [(prompt, prompt_len, output_len)
+                              for prompt, prompt_formatted, prompt_len,
+                              output_len in input_requests]
+        else:
+            assert (
+                tokenizer.chat_template or tokenizer.default_chat_template
+            ), "Tokenizer/model must have chat template for sonnet dataset."
+            input_requests = sample_sonnet_requests(
+                dataset_path=args.dataset_path,
+                num_requests=args.num_prompts,
+                input_len=args.sonnet_input_len,
+                output_len=args.sonnet_output_len,
+                prefix_len=args.sonnet_prefix_len,
+                tokenizer=tokenizer,
+            )
+            input_requests = [(prompt_formatted, prompt_len, output_len)
+                              for prompt, prompt_formatted, prompt_len,
+                              output_len in input_requests]
+
+    else:
+        raise ValueError(f"Unknown dataset: {args.dataset_name}")
+
+    benchmark_result = asyncio.run(
+        benchmark(
+            backend=backend,
+            api_url=api_url,
+            model_id=model_id,
+            tokenizer=tokenizer,
+            input_requests=input_requests,
+            best_of=args.best_of,
+            use_beam_search=args.use_beam_search,
+            request_rate=args.request_rate,
+            disable_tqdm=args.disable_tqdm,
+        ))
+
+    # Save config and results to json
+    if args.save_result:
+        result_json = {}
+
+        # Setup
+        current_dt = datetime.now().strftime("%Y%m%d-%H%M%S")
+        result_json["date"] = current_dt
+        result_json["backend"] = backend
+        result_json["model_id"] = model_id
+        result_json["tokenizer_id"] = tokenizer_id
+        result_json["best_of"] = args.best_of
+        result_json["use_beam_search"] = args.use_beam_search
+        result_json["num_prompts"] = args.num_prompts
+
+        # Metadata
+        if args.metadata:
+            for item in args.metadata:
+                if "=" in item:
+                    kvstring = item.split("=")
+                    result_json[kvstring[0].strip()] = kvstring[1].strip()
+                else:
+                    raise ValueError(
+                        "Invalid metadata format. Please use KEY=VALUE format."
+                    )
+
+        # Traffic
+        result_json["request_rate"] = (
+            args.request_rate if args.request_rate < float("inf") else "inf")
+
+        # Merge with benchmark result
+        result_json = {**result_json, **benchmark_result}
+
+        # Save to file
+        base_model_id = model_id.split("/")[-1]
+        file_name = f"{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json"  #noqa
+        if args.result_dir:
+            file_name = os.path.join(args.result_dir, file_name)
+        with open(file_name, "w") as outfile:
+            json.dump(result_json, outfile)
 
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(
         description="Benchmark the online serving throughput.")
-    parser.add_argument("--backend", type=str, default="vllm",
-                        choices=["vllm", "tgi"])
+    parser.add_argument(
+        "--backend",
+        type=str,
+        default="vllm",
+        choices=list(ASYNC_REQUEST_FUNCS.keys()),
+    )
+    parser.add_argument(
+        "--base-url",
+        type=str,
+        default=None,
+        help="Server or API base url if not using http host and port.",
+    )
     parser.add_argument("--host", type=str, default="localhost")
     parser.add_argument("--port", type=int, default=8000)
-    parser.add_argument("--dataset", type=str, required=True,
-                        help="Path to the dataset.")
-    parser.add_argument("--tokenizer", type=str, required=True,
-                        help="Name or path of the tokenizer.")
-    parser.add_argument("--best-of", type=int, default=1,
-                        help="Generates `best_of` sequences per prompt and "
-                             "returns the best one.")
+    parser.add_argument(
+        "--endpoint",
+        type=str,
+        default="/v1/completions",
+        help="API endpoint.",
+    )
+    parser.add_argument(
+        "--dataset",
+        type=str,
+        default=None,
+        help="Path to the ShareGPT dataset, will be deprecated in the "
+        "next release.",
+    )
+    parser.add_argument(
+        "--dataset-name",
+        type=str,
+        default="sharegpt",
+        choices=["sharegpt", "sonnet"],
+        help="Name of the dataset to benchmark on.",
+    )
+    parser.add_argument("--dataset-path",
+                        type=str,
+                        default=None,
+                        help="Path to the dataset.")
+    parser.add_argument(
+        "--model",
+        type=str,
+        required=True,
+        help="Name of the model.",
+    )
+    parser.add_argument(
+        "--tokenizer",
+        type=str,
+        help=
+        "Name or path of the tokenizer, if not using the default tokenizer.",
+    )
+    parser.add_argument(
+        "--best-of",
+        type=int,
+        default=1,
+        help="Generates `best_of` sequences per prompt and "
+        "returns the best one.",
+    )
     parser.add_argument("--use-beam-search", action="store_true")
-    parser.add_argument("--num-prompts", type=int, default=1000,
-                        help="Number of prompts to process.")
-    parser.add_argument("--request-rate", type=float, default=float("inf"),
-                        help="Number of requests per second. If this is inf, "
-                             "then all the requests are sent at time 0. "
-                             "Otherwise, we use Poisson process to synthesize "
-                             "the request arrival times.")
+    parser.add_argument(
+        "--num-prompts",
+        type=int,
+        default=1000,
+        help="Number of prompts to process.",
+    )
+    parser.add_argument(
+        "--sharegpt-output-len",
+        type=int,
+        default=None,
+        help="Output length for each request. Overrides the output length "
+        "from the ShareGPT dataset.")
+    parser.add_argument(
+        "--sonnet-input-len",
+        type=int,
+        default=550,
+        help=
+        "Number of input tokens per request, used only for sonnet dataset.",
+    )
+    parser.add_argument(
+        "--sonnet-output-len",
+        type=int,
+        default=150,
+        help=
+        "Number of output tokens per request, used only for sonnet dataset.",
+    )
+    parser.add_argument(
+        "--sonnet-prefix-len",
+        type=int,
+        default=200,
+        help=
+        "Number of prefix tokens per request, used only for sonnet dataset.",
+    )
+    parser.add_argument(
+        "--request-rate",
+        type=float,
+        default=float("inf"),
+        help="Number of requests per second. If this is inf, "
+        "then all the requests are sent at time 0. "
+        "Otherwise, we use Poisson process to synthesize "
+        "the request arrival times.",
+    )
     parser.add_argument("--seed", type=int, default=0)
-    parser.add_argument('--trust-remote-code', action='store_true',
-                        help='trust remote code from huggingface')
+    parser.add_argument(
+        "--trust-remote-code",
+        action="store_true",
+        help="Trust remote code from huggingface",
+    )
+    parser.add_argument(
+        "--disable-tqdm",
+        action="store_true",
+        help="Specify to disable tqdm progress bar.",
+    )
+    parser.add_argument(
+        "--save-result",
+        action="store_true",
+        help="Specify to save benchmark results to a json file",
+    )
+    parser.add_argument(
+        "--metadata",
+        metavar="KEY=VALUE",
+        nargs="*",
+        help="Key-value pairs (e.g, --metadata version=0.3.3 tp=1) "
+        "for metadata of this run to be saved in the result JSON file "
+        "for record keeping purposes.",
+    )
+    parser.add_argument(
+        "--result-dir",
+        type=str,
+        default=None,
+        help="Specify directory to save benchmark json results."
+        "If not specified, results are saved in the current directory.",
+    )
 
     args = parser.parse_args()
     main(args)
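Not part of the diff: once --save-result is passed, the JSON written by the code above can be post-processed offline. A sketch with a placeholder file name (the real name follows the backend-qps-model-date pattern built in main()):

import json

with open("serving_result.json") as f:   # placeholder path
    result = json.load(f)

print("completed requests:", result["completed"])
print("p99 TTFT (ms):     ", result["p99_ttft_ms"])
print("output tok/s:      ", result["output_throughput"])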
@@ -6,9 +6,11 @@ import time
 from typing import List, Optional, Tuple
 
 import torch
+from tqdm import tqdm
 from transformers import (AutoModelForCausalLM, AutoTokenizer,
                           PreTrainedTokenizerBase)
-from tqdm import tqdm
+
+from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
 
 
 def sample_requests(
@@ -17,9 +19,8 @@ def sample_requests(
     tokenizer: PreTrainedTokenizerBase,
     fixed_output_len: Optional[int],
 ) -> List[Tuple[str, int, int]]:
-    if fixed_output_len is not None:
-        if fixed_output_len < 4:
-            raise ValueError("output_len too small")
+    if fixed_output_len is not None and fixed_output_len < 4:
+        raise ValueError("output_len too small")
 
     # Load the dataset.
     with open(dataset_path) as f:
@@ -30,22 +31,23 @@ def sample_requests(
     dataset = [(data["conversations"][0]["value"],
                 data["conversations"][1]["value"]) for data in dataset]
 
-    # Tokenize the prompts and completions.
-    prompts = [prompt for prompt, _ in dataset]
-    prompt_token_ids = tokenizer(prompts).input_ids
-    completions = [completion for _, completion in dataset]
-    completion_token_ids = tokenizer(completions).input_ids
-    tokenized_dataset = []
-    for i in range(len(dataset)):
-        output_len = len(completion_token_ids[i])
-        if fixed_output_len is not None:
-            output_len = fixed_output_len
-        tokenized_dataset.append((prompts[i], prompt_token_ids[i], output_len))
+    # Shuffle the dataset.
+    random.shuffle(dataset)
 
-    # Filter out too long sequences.
+    # Filter out sequences that are too long or too short
     filtered_dataset: List[Tuple[str, int, int]] = []
-    for prompt, prompt_token_ids, output_len in tokenized_dataset:
+    for i in range(len(dataset)):
+        if len(filtered_dataset) == num_requests:
+            break
+
+        # Tokenize the prompts and completions.
+        prompt = dataset[i][0]
+        prompt_token_ids = tokenizer(prompt).input_ids
+        completion = dataset[i][1]
+        completion_token_ids = tokenizer(completion).input_ids
         prompt_len = len(prompt_token_ids)
+        output_len = len(completion_token_ids
+                         ) if fixed_output_len is None else fixed_output_len
         if prompt_len < 4 or output_len < 4:
             # Prune too short sequences.
             continue
@@ -54,9 +56,7 @@ def sample_requests(
             continue
         filtered_dataset.append((prompt, prompt_len, output_len))
 
-    # Sample the requests.
-    sampled_requests = random.sample(filtered_dataset, num_requests)
-    return sampled_requests
+    return filtered_dataset
 
 
 def run_vllm(
@@ -70,6 +70,17 @@ def run_vllm(
     use_beam_search: bool,
     trust_remote_code: bool,
     dtype: str,
+    max_model_len: Optional[int],
+    enforce_eager: bool,
+    kv_cache_dtype: str,
+    quantization_param_path: Optional[str],
+    device: str,
+    enable_prefix_caching: bool,
+    enable_chunked_prefill: bool,
+    max_num_batched_tokens: int,
+    distributed_executor_backend: Optional[str],
+    gpu_memory_utilization: float = 0.9,
+    download_dir: Optional[str] = None,
 ) -> float:
     from vllm import LLM, SamplingParams
     llm = LLM(
@@ -80,28 +91,36 @@ def run_vllm(
         seed=seed,
         trust_remote_code=trust_remote_code,
         dtype=dtype,
+        max_model_len=max_model_len,
+        gpu_memory_utilization=gpu_memory_utilization,
+        enforce_eager=enforce_eager,
+        kv_cache_dtype=kv_cache_dtype,
+        quantization_param_path=quantization_param_path,
+        device=device,
+        enable_prefix_caching=enable_prefix_caching,
+        download_dir=download_dir,
+        enable_chunked_prefill=enable_chunked_prefill,
+        max_num_batched_tokens=max_num_batched_tokens,
+        distributed_executor_backend=distributed_executor_backend,
     )
 
     # Add the requests to the engine.
+    prompts = []
+    sampling_params = []
     for prompt, _, output_len in requests:
-        sampling_params = SamplingParams(
-            n=n,
-            temperature=0.0 if use_beam_search else 1.0,
-            top_p=1.0,
-            use_beam_search=use_beam_search,
-            ignore_eos=True,
-            max_tokens=output_len,
-        )
-        # FIXME(woosuk): Do not use internal method.
-        llm._add_request(
-            prompt=prompt,
-            prompt_token_ids=None,
-            sampling_params=sampling_params,
-        )
+        prompts.append(prompt)
+        sampling_params.append(
+            SamplingParams(
+                n=n,
+                temperature=0.0 if use_beam_search else 1.0,
+                top_p=1.0,
+                use_beam_search=use_beam_search,
+                ignore_eos=True,
+                max_tokens=output_len,
+            ))
 
     start = time.perf_counter()
-    # FIXME(woosuk): Do not use internal method.
-    llm._run_engine(use_tqdm=True)
+    llm.generate(prompts, sampling_params, use_tqdm=True)
     end = time.perf_counter()
     return end - start
 
@@ -172,13 +191,15 @@ def run_mii(
     tensor_parallel_size: int,
     output_len: int,
 ) -> float:
-    from mii import pipeline
-    llm = pipeline(model, tensor_parallel=tensor_parallel_size)
+    from mii import client, serve
+    llm = serve(model, tensor_parallel=tensor_parallel_size)
     prompts = [prompt for prompt, _, _ in requests]
 
     start = time.perf_counter()
-    llm(prompts, max_new_tokens=output_len)
+    llm.generate(prompts, max_new_tokens=output_len)
     end = time.perf_counter()
+    client = client(model)
+    client.terminate_server()
     return end - start
 
 
@@ -199,10 +220,15 @@ def main(args: argparse.Namespace):
                                 args.output_len)
 
     if args.backend == "vllm":
-        elapsed_time = run_vllm(requests, args.model, args.tokenizer,
-                                args.quantization, args.tensor_parallel_size,
-                                args.seed, args.n, args.use_beam_search,
-                                args.trust_remote_code, args.dtype)
+        elapsed_time = run_vllm(
+            requests, args.model, args.tokenizer, args.quantization,
+            args.tensor_parallel_size, args.seed, args.n, args.use_beam_search,
+            args.trust_remote_code, args.dtype, args.max_model_len,
+            args.enforce_eager, args.kv_cache_dtype,
+            args.quantization_param_path, args.device,
+            args.enable_prefix_caching, args.enable_chunked_prefill,
+            args.max_num_batched_tokens, args.distributed_executor_backend,
+            args.gpu_memory_utilization, args.download_dir)
     elif args.backend == "hf":
         assert args.tensor_parallel_size == 1
         elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
@@ -218,6 +244,18 @@ def main(args: argparse.Namespace):
     print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
           f"{total_num_tokens / elapsed_time:.2f} tokens/s")
 
+    # Output JSON results if specified
+    if args.output_json:
+        results = {
+            "elapsed_time": elapsed_time,
+            "num_requests": len(requests),
+            "total_num_tokens": total_num_tokens,
+            "requests_per_second": len(requests) / elapsed_time,
+            "tokens_per_second": total_num_tokens / elapsed_time,
+        }
+        with open(args.output_json, "w") as f:
+            json.dump(results, f, indent=4)
+
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Benchmark the throughput.")
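Not part of the diff: a toy recomputation of the throughput figures printed above, assuming (as in the surrounding script) that total_num_tokens counts prompt plus generated tokens per request. The values are invented.

requests = [("prompt a", 120, 80), ("prompt b", 200, 64)]  # (prompt, prompt_len, output_len)
elapsed_time = 4.0                                         # seconds, as returned by run_vllm()

total_num_tokens = sum(p_len + o_len for _, p_len, o_len in requests)
print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
      f"{total_num_tokens / elapsed_time:.2f} tokens/s")   # 0.50 requests/s, 116.00 tokens/s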
@@ -242,7 +280,7 @@ if __name__ == "__main__":
     parser.add_argument("--tokenizer", type=str, default=None)
     parser.add_argument('--quantization',
                         '-q',
-                        choices=['awq', 'squeezellm', None],
+                        choices=[*QUANTIZATION_METHODS, None],
                         default=None)
     parser.add_argument("--tensor-parallel-size", "-tp", type=int, default=1)
     parser.add_argument("--n",
@@ -262,6 +300,12 @@ if __name__ == "__main__":
     parser.add_argument('--trust-remote-code',
                         action='store_true',
                         help='trust remote code from huggingface')
+    parser.add_argument(
+        '--max-model-len',
+        type=int,
+        default=None,
+        help='Maximum length of a sequence (including prompt and output). '
+        'If None, will be derived from the model.')
     parser.add_argument(
         '--dtype',
         type=str,
@@ -271,6 +315,68 @@ if __name__ == "__main__":
         'The "auto" option will use FP16 precision '
         'for FP32 and FP16 models, and BF16 precision '
         'for BF16 models.')
+    parser.add_argument('--gpu-memory-utilization',
+                        type=float,
+                        default=0.9,
+                        help='the fraction of GPU memory to be used for '
+                        'the model executor, which can range from 0 to 1.'
+                        'If unspecified, will use the default value of 0.9.')
+    parser.add_argument("--enforce-eager",
+                        action="store_true",
+                        help="enforce eager execution")
+    parser.add_argument(
+        '--kv-cache-dtype',
+        type=str,
+        choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3'],
+        default="auto",
+        help='Data type for kv cache storage. If "auto", will use model '
+        'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. '
+        'ROCm (AMD GPU) supports fp8 (=fp8_e4m3)')
+    parser.add_argument(
+        '--quantization-param-path',
+        type=str,
+        default=None,
+        help='Path to the JSON file containing the KV cache scaling factors. '
+        'This should generally be supplied, when KV cache dtype is FP8. '
+        'Otherwise, KV cache scaling factors default to 1.0, which may cause '
+        'accuracy issues. FP8_E5M2 (without scaling) is only supported on '
+        'cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is '
+        'instead supported for common inference criteria.')
+    parser.add_argument(
+        "--device",
+        type=str,
+        default="cuda",
+        choices=["cuda", "cpu"],
+        help='device type for vLLM execution, supporting CUDA and CPU.')
+    parser.add_argument(
+        "--enable-prefix-caching",
+        action='store_true',
+        help="enable automatic prefix caching for vLLM backend.")
+    parser.add_argument("--enable-chunked-prefill",
+                        action='store_true',
+                        help="enable chunked prefill for vLLM backend.")
+    parser.add_argument('--max-num-batched-tokens',
+                        type=int,
+                        default=None,
+                        help='maximum number of batched tokens per '
+                        'iteration')
+    parser.add_argument('--download-dir',
+                        type=str,
+                        default=None,
+                        help='directory to download and load the weights, '
+                        'default to the default cache dir of huggingface')
+    parser.add_argument(
+        '--output-json',
+        type=str,
+        default=None,
+        help='Path to save the throughput results in JSON format.')
+    parser.add_argument(
+        '--distributed-executor-backend',
+        choices=['ray', 'mp'],
+        default=None,
+        help='Backend to use for distributed serving. When more than 1 GPU '
+        'is used, will be automatically set to "ray" if installed '
+        'or "mp" (multiprocessing) otherwise.')
     args = parser.parse_args()
     if args.tokenizer is None:
         args.tokenizer = args.model
benchmarks/cutlass_benchmarks/w8a8_benchmarks.py (new file, 352 lines)
@@ -0,0 +1,352 @@
+import argparse
+import copy
+import itertools
+import pickle as pkl
+import time
+from typing import Callable, Iterable, List, Tuple
+
+import torch
+import torch.utils.benchmark as TBenchmark
+from torch.utils.benchmark import Measurement as TMeasurement
+from weight_shapes import WEIGHT_SHAPES
+
+from vllm import _custom_ops as ops
+
+DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())[1:]
+DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512]
+DEFAULT_TP_SIZES = [1]
+
+# helpers
+
+
+def to_fp8(tensor: torch.tensor) -> torch.tensor:
+    finfo = torch.finfo(torch.float8_e4m3fn)
+    return torch.round(tensor.clamp(
+        min=finfo.min, max=finfo.max)).to(dtype=torch.float8_e4m3fn)
+
+
+def to_int8(tensor: torch.tensor) -> torch.tensor:
+    return torch.round(tensor.clamp(min=-128, max=127)).to(dtype=torch.int8)
+
+
+def make_rand_tensors(dtype: torch.dtype, m: int, n: int,
+                      k: int) -> Tuple[torch.tensor, torch.tensor]:
+
+    a = torch.randn((m, k), device='cuda') * 5
+    b = torch.randn((n, k), device='cuda').t() * 5
+
+    if dtype == torch.int8:
+        return to_int8(a), to_int8(b)
+    if dtype == torch.float8_e4m3fn:
+        return to_fp8(a), to_fp8(b)
+
+    raise ValueError("unsupported dtype")
+
+
+# impl
+
+
+def pytorch_i8_impl(a: torch.tensor, b: torch.tensor, scale_a: torch.tensor,
+                    scale_b: torch.tensor,
+                    out_dtype: torch.dtype) -> torch.tensor:
+    return torch.mm(a, b)
+
+
+def pytorch_fp8_impl(a: torch.tensor, b: torch.tensor, scale_a: torch.tensor,
+                     scale_b: torch.tensor,
+                     out_dtype: torch.dtype) -> torch.tensor:
+    return torch._scaled_mm(a,
+                            b,
+                            scale_a=scale_a,
+                            scale_b=scale_b,
+                            out_dtype=out_dtype)
+
+
+def pytorch_fp8_impl_fast_accum(a: torch.tensor, b: torch.tensor,
+                                scale_a: torch.tensor, scale_b: torch.tensor,
+                                out_dtype: torch.dtype) -> torch.tensor:
+    return torch._scaled_mm(a,
+                            b,
+                            scale_a=scale_a,
+                            scale_b=scale_b,
+                            out_dtype=out_dtype,
+                            use_fast_accum=True)
+
+
+def cutlass_impl(a: torch.tensor, b: torch.tensor, scale_a: torch.tensor,
+                 scale_b: torch.tensor,
+                 out_dtype: torch.dtype) -> torch.tensor:
+    return ops.cutlass_scaled_mm_dq(a,
+                                    b,
+                                    scale_a,
+                                    scale_b,
+                                    out_dtype=out_dtype)
+
+
+# bench
+def bench_fn(a: torch.tensor, b: torch.tensor, scale_a: torch.tensor,
+             scale_b: torch.tensor, out_dtype: torch.dtype, label: str,
+             sub_label: str, fn: Callable, description: str) -> TMeasurement:
+
+    min_run_time = 1
+
+    globals = {
+        "a": a,
+        "b": b,
+        "scale_a": scale_a,
+        "scale_b": scale_b,
+        "out_dtype": out_dtype,
+        "fn": fn,
+    }
+    return TBenchmark.Timer(
+        stmt="fn(a, b, scale_a, scale_b, out_dtype)",
+        globals=globals,
+        label=label,
+        sub_label=sub_label,
+        description=description,
+    ).blocked_autorange(min_run_time=min_run_time)
+
+
+def bench_int8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
+               sub_label: str) -> Iterable[TMeasurement]:
+    assert dtype == torch.int8
+    a, b = make_rand_tensors(torch.int8, m, n, k)
+    scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
+    scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)
+
+    timers = []
+    # pytorch impl
+    timers.append(
+        bench_fn(a.to(dtype=torch.bfloat16, device="cuda"),
+                 b.to(dtype=torch.bfloat16, device="cuda"), scale_a, scale_b,
+                 torch.bfloat16, label, sub_label, pytorch_i8_impl,
+                 "pytorch_bf16_bf16_bf16_matmul-no-scales"))
+
+    # cutlass impl
+    timers.append(
+        bench_fn(a, b, scale_a.to(device="cpu"), scale_b.to(device="cpu"),
+                 torch.bfloat16, label, sub_label, cutlass_impl,
+                 "cutlass_i8_i8_bf16_scaled_mm"))
+
+    return timers
+
+
+def bench_fp8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
+              sub_label: str) -> Iterable[TMeasurement]:
+    assert dtype == torch.float8_e4m3fn
+    a, b = make_rand_tensors(torch.float8_e4m3fn, m, n, k)
+    scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
+    scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)
+
+    timers = []
+
+    # pytorch impl: bf16 output, without fp8 fast accum
+    timers.append(
+        bench_fn(a, b, scale_a, scale_b, torch.bfloat16, label, sub_label,
+                 pytorch_fp8_impl, "pytorch_fp8_fp8_bf16_scaled_mm"))
+
+    # pytorch impl: bf16 output, with fp8 fast accum
+    timers.append(
+        bench_fn(a, b, scale_a, scale_b, torch.bfloat16, label, sub_label,
+                 pytorch_fp8_impl_fast_accum,
+                 "pytorch_fp8_fp8_bf16_scaled_mm_fast_accum"))
+
+    # pytorch impl: fp16 output, without fp8 fast accum
+    timers.append(
+        bench_fn(a, b, scale_a, scale_b, torch.float16, label, sub_label,
+                 pytorch_fp8_impl, "pytorch_fp8_fp8_fp16_scaled_mm"))
+
+    # pytorch impl: fp16 output, with fp8 fast accum
+    timers.append(
+        bench_fn(a, b, scale_a, scale_b, torch.float16, label, sub_label,
+                 pytorch_fp8_impl_fast_accum,
+                 "pytorch_fp8_fp8_fp16_scaled_mm_fast_accum"))
+
+    # cutlass impl: bf16 output
+    timers.append(
+        bench_fn(a, b, scale_a.to(device="cpu"), scale_b.to(device="cpu"),
+                 torch.bfloat16, label, sub_label, cutlass_impl,
+                 "cutlass_fp8_fp8_bf16_scaled_mm"))
+    # cutlass impl: fp16 output
+    timers.append(
+        bench_fn(a, b, scale_a.to(device="cpu"), scale_b.to(device="cpu"),
+                 torch.float16, label, sub_label, cutlass_impl,
+                 "cutlass_fp8_fp8_fp16_scaled_mm"))
+    return timers
+
+
+def bench(dtype: torch.dtype, m: int, k: int, n: int, label: str,
+          sub_label: str) -> Iterable[TMeasurement]:
+    if dtype == torch.int8:
+        return bench_int8(dtype, m, k, n, label, sub_label)
+    if dtype == torch.float8_e4m3fn:
+        return bench_fp8(dtype, m, k, n, label, sub_label)
+    raise ValueError("unsupported type")
+
+
+# runner
+def print_timers(timers: Iterable[TMeasurement]):
+    compare = TBenchmark.Compare(timers)
+    compare.print()
+
+
+def run(dtype: torch.dtype,
+        MKNs: Iterable[Tuple[int, int, int]]) -> Iterable[TMeasurement]:
+
+    results = []
+    for m, k, n in MKNs:
+        timers = bench(dtype, m, k, n, f"scaled-{dtype}-gemm",
+                       f"MKN=({m}x{k}x{n})")
+        print_timers(timers)
+        results.extend(timers)
+
+    return results
+
+
+# output makers
+def make_output(data: Iterable[TMeasurement],
+                MKNs: Iterable[Tuple[int, int, int]],
+                base_description: str,
+                timestamp=None):
+
+    print(f"== All Results {base_description} ====")
+    print_timers(data)
+
+    # pickle all the results
+    timestamp = int(time.time()) if timestamp is None else timestamp
+    with open(f"{base_description}-{timestamp}.pkl", "wb") as f:
+        pkl.dump(data, f)
+
+
+# argparse runners
+
+
+def run_square_bench(args):
+    dim_sizes = list(
+        range(args.dim_start, args.dim_end + 1, args.dim_increment))
+    MKNs = list(zip(dim_sizes, dim_sizes, dim_sizes))
+    data = run(args.dtype, MKNs)
+
+    make_output(data, MKNs, f"square_bench-{args.dtype}")
+
+
+def run_range_bench(args):
+    dim_sizes = list(range(args.dim_start, args.dim_end, args.dim_increment))
+    n = len(dim_sizes)
+    Ms = [args.m_constant] * n if args.m_constant is not None else dim_sizes
+    Ks = [args.k_constant] * n if args.k_constant is not None else dim_sizes
+    Ns = [args.n_constant] * n if args.n_constant is not None else dim_sizes
+    MKNs = list(zip(Ms, Ks, Ns))
+    data = run(args.dtype, MKNs)
+
+    make_output(data, MKNs, f"range_bench-{args.dtype}")
+
+
+def run_model_bench(args):
+
+    print("Benchmarking models:")
+    for i, model in enumerate(args.models):
+        print(f"[{i}] {model}")
+
+    def model_shapes(model_name: str, tp_size: int) -> List[Tuple[int, int]]:
+        KNs = []
+        for KN, tp_split_dim in copy.deepcopy(WEIGHT_SHAPES[model_name]):
+            KN[tp_split_dim] = KN[tp_split_dim] // tp_size
+            KNs.append(KN)
+        return KNs
+
+    model_bench_data = []
+    models_tps = list(itertools.product(args.models, args.tp_sizes))
+    for model, tp_size in models_tps:
+        Ms = args.batch_sizes
+        KNs = model_shapes(model, tp_size)
+        MKNs = []
+        for m in Ms:
+            for k, n in KNs:
+                MKNs.append((m, k, n))
+
+        data = run(args.dtype, MKNs)
+        model_bench_data.append(data)
+
+    # Print all results
+    for data, model_tp in zip(model_bench_data, models_tps):
|
model, tp_size = model_tp
|
||||||
|
print(f"== Results {args.dtype} {model}-TP{tp_size} ====")
|
||||||
|
print_timers(data)
|
||||||
|
|
||||||
|
timestamp = int(time.time())
|
||||||
|
|
||||||
|
all_data = []
|
||||||
|
for d in model_bench_data:
|
||||||
|
all_data.extend(d)
|
||||||
|
# pickle all data
|
||||||
|
with open(f"model_bench-{args.dtype}-{timestamp}.pkl", "wb") as f:
|
||||||
|
pkl.dump(all_data, f)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
|
||||||
|
def to_torch_dtype(dt):
|
||||||
|
if dt == "int8":
|
||||||
|
return torch.int8
|
||||||
|
if dt == "fp8":
|
||||||
|
return torch.float8_e4m3fn
|
||||||
|
raise ValueError("unsupported dtype")
|
||||||
|
|
||||||
|
parser = argparse.ArgumentParser(
|
||||||
|
description="""
|
||||||
|
Benchmark Cutlass GEMM.
|
||||||
|
|
||||||
|
To run square GEMMs:
|
||||||
|
python3 ./benchmarks/cutlass_benchmarks/w8a8_benchmarks.py --dtype fp8 square_bench --dim-start 128 --dim-end 512 --dim-increment 64
|
||||||
|
|
||||||
|
To run constant N and K and sweep M:
|
||||||
|
python3 ./benchmarks/cutlass_benchmarks/w8a8_benchmarks.py --dtype fp8 range_bench --dim-start 128 --dim-end 512 --dim-increment 64 --n-constant 16384 --k-constant 16384
|
||||||
|
|
||||||
|
To run dimensions from a model:
|
||||||
|
python3 ./benchmarks/cutlass_benchmarks/w8a8_benchmarks.py --dtype fp8 model_bench --models meta-llama/Llama-2-7b-hf --batch-sizes 16 --tp-sizes 1
|
||||||
|
|
||||||
|
Output:
|
||||||
|
- a .pkl file, that is a list of raw torch.benchmark.utils.Measurements for the pytorch and cutlass implementations for the various GEMMs.
|
||||||
|
""", # noqa: E501
|
||||||
|
formatter_class=argparse.RawTextHelpFormatter)
|
||||||
|
|
||||||
|
parser.add_argument("--dtype",
|
||||||
|
type=to_torch_dtype,
|
||||||
|
required=True,
|
||||||
|
help="Available options are ['int8', 'fp8']")
|
||||||
|
subparsers = parser.add_subparsers(dest="cmd")
|
||||||
|
|
||||||
|
square_parser = subparsers.add_parser("square_bench")
|
||||||
|
square_parser.add_argument("--dim-start", type=int, required=True)
|
||||||
|
square_parser.add_argument("--dim-end", type=int, required=True)
|
||||||
|
square_parser.add_argument("--dim-increment", type=int, required=True)
|
||||||
|
square_parser.set_defaults(func=run_square_bench)
|
||||||
|
|
||||||
|
range_parser = subparsers.add_parser("range_bench")
|
||||||
|
range_parser.add_argument("--dim-start", type=int, required=True)
|
||||||
|
range_parser.add_argument("--dim-end", type=int, required=True)
|
||||||
|
range_parser.add_argument("--dim-increment", type=int, required=True)
|
||||||
|
range_parser.add_argument("--m-constant", type=int, default=None)
|
||||||
|
range_parser.add_argument("--n-constant", type=int, default=None)
|
||||||
|
range_parser.add_argument("--k-constant", type=int, default=None)
|
||||||
|
range_parser.set_defaults(func=run_range_bench)
|
||||||
|
|
||||||
|
model_parser = subparsers.add_parser("model_bench")
|
||||||
|
model_parser.add_argument("--models",
|
||||||
|
nargs="+",
|
||||||
|
type=str,
|
||||||
|
default=DEFAULT_MODELS,
|
||||||
|
choices=WEIGHT_SHAPES.keys())
|
||||||
|
model_parser.add_argument("--tp-sizes",
|
||||||
|
nargs="+",
|
||||||
|
type=int,
|
||||||
|
default=DEFAULT_TP_SIZES)
|
||||||
|
model_parser.add_argument("--batch-sizes",
|
||||||
|
nargs="+",
|
||||||
|
type=int,
|
||||||
|
default=DEFAULT_BATCH_SIZES)
|
||||||
|
model_parser.set_defaults(func=run_model_bench)
|
||||||
|
|
||||||
|
args = parser.parse_args()
|
||||||
|
args.func(args)
|
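The .pkl files written by make_output() and run_model_bench() hold raw torch.utils.benchmark Measurement objects, so a run can be reloaded and re-compared offline. A minimal sketch, assuming an example file name (substitute whatever square_bench/range_bench/model_bench actually produced):

import pickle

import torch.utils.benchmark as TBenchmark

# Example path only; the benchmark writes f"{base_description}-{timestamp}.pkl".
with open("square_bench-torch.float8_e4m3fn-1700000000.pkl", "rb") as f:
    measurements = pickle.load(f)

# Re-render the same comparison table that print_timers() shows during the run.
compare = TBenchmark.Compare(measurements)
compare.trim_significant_figures()
compare.print()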
benchmarks/cutlass_benchmarks/weight_shapes.py (new file, +37 lines)
@@ -0,0 +1,37 @@
# Weight Shapes are in the format
#  ([K, N], TP_SPLIT_DIM)
# Example:
#  A shape of ([14336, 4096], 0) indicates the following GEMM shape,
#   - TP1 : K = 14336, N = 4096
#   - TP2 : K = 7168, N = 4096
#  A shape of ([4096, 6144], 1) indicates the following GEMM shape,
#   - TP1 : K = 4096, N = 6144
#   - TP4 : K = 4096, N = 1536

# TP1 shapes
WEIGHT_SHAPES = {
    "mistralai/Mistral-7B-v0.1": [
        ([4096, 6144], 1),
        ([4096, 4096], 0),
        ([4096, 28672], 1),
        ([14336, 4096], 0),
    ],
    "meta-llama/Llama-2-7b-hf": [
        ([4096, 12288], 1),
        ([4096, 4096], 0),
        ([4096, 22016], 1),
        ([11008, 4096], 0),
    ],
    "meta-llama/Llama-2-13b-hf": [
        ([5120, 15360], 1),
        ([5120, 5120], 0),
        ([5120, 27648], 1),
        ([13824, 5120], 0),
    ],
    "meta-llama/Llama-2-70b-hf": [
        ([8192, 10240], 1),
        ([8192, 8192], 0),
        ([8192, 57344], 1),
        ([28672, 8192], 0),
    ],
}
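For reference, this is how the TP_SPLIT_DIM entry is consumed: the split dimension of each [K, N] pair is divided by the tensor-parallel size, mirroring the model_shapes() helper in w8a8_benchmarks.py above. A small illustrative sketch (the helper name here is just for illustration):

import copy

def shapes_for_tp(model_name: str, tp_size: int):
    # WEIGHT_SHAPES maps model -> list of ([K, N], TP_SPLIT_DIM) pairs.
    shapes = []
    for kn, tp_split_dim in copy.deepcopy(WEIGHT_SHAPES[model_name]):
        kn[tp_split_dim] //= tp_size
        shapes.append(kn)
    return shapes

# e.g. ([28672, 8192], 0) at TP2 becomes [14336, 8192].
print(shapes_for_tp("meta-llama/Llama-2-70b-hf", tp_size=2))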
benchmarks/kernels/benchmark_aqlm.py (new file, +302 lines)
@@ -0,0 +1,302 @@
|
|||||||
|
import argparse
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
import torch
|
||||||
|
import torch.nn.functional as F
|
||||||
|
|
||||||
|
from vllm import _custom_ops as ops
|
||||||
|
from vllm.model_executor.layers.quantization.aqlm import (
|
||||||
|
dequantize_weight, generic_dequantize_gemm, get_int_dtype,
|
||||||
|
optimized_dequantize_gemm)
|
||||||
|
|
||||||
|
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
|
||||||
|
|
||||||
|
|
||||||
|
def torch_mult(
|
||||||
|
input: torch.Tensor, # [..., in_features]
|
||||||
|
weights: torch.Tensor,
|
||||||
|
scales: torch.Tensor, # [num_out_groups, 1, 1, 1]
|
||||||
|
) -> torch.Tensor:
|
||||||
|
output = F.linear(input, weights)
|
||||||
|
return output
|
||||||
|
|
||||||
|
|
||||||
|
def dequant_out_scale(
|
||||||
|
input: torch.Tensor, # [..., in_features]
|
||||||
|
codes: torch.IntTensor, # [num_out_groups, num_in_groups, num_codebooks]
|
||||||
|
codebooks: torch.
|
||||||
|
Tensor, # [num_codebooks, codebook_size, out_group_size, in_group_size]
|
||||||
|
scales: torch.Tensor, # [num_out_groups, 1, 1, 1]
|
||||||
|
output_partition_sizes: torch.IntTensor,
|
||||||
|
bias: Optional[torch.Tensor],
|
||||||
|
) -> torch.Tensor:
|
||||||
|
|
||||||
|
weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes)
|
||||||
|
|
||||||
|
if bias is None:
|
||||||
|
output = F.linear(input, weights, bias)
|
||||||
|
orig_shape = output.shape
|
||||||
|
flattened_output = output.view(-1, output.size(-1))
|
||||||
|
f_scales = scales.view(-1, scales.shape[0])
|
||||||
|
b_scales = f_scales.expand(flattened_output.shape[0], -1)
|
||||||
|
flattened_output *= b_scales
|
||||||
|
return flattened_output.view(orig_shape)
|
||||||
|
else:
|
||||||
|
b_scales = scales.view(scales.shape[:-3] + (-1, )).expand(
|
||||||
|
-1, weights.shape[1])
|
||||||
|
weights *= b_scales
|
||||||
|
return F.linear(input, weights, bias)
|
||||||
|
|
||||||
|
|
||||||
|
def dequant_weight_scale(
|
||||||
|
input: torch.Tensor, # [..., in_features]
|
||||||
|
codes: torch.IntTensor, # [num_out_groups, num_in_groups, num_codebooks]
|
||||||
|
codebooks: torch.
|
||||||
|
Tensor, # [num_codebooks, codebook_size, out_group_size, in_group_size]
|
||||||
|
scales: torch.Tensor, # [num_out_groups, 1, 1, 1]
|
||||||
|
output_partition_sizes: torch.IntTensor,
|
||||||
|
bias: Optional[torch.Tensor],
|
||||||
|
) -> torch.Tensor:
|
||||||
|
|
||||||
|
weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes)
|
||||||
|
|
||||||
|
b_scales = scales.view(scales.shape[:-3] + (-1, )).expand(
|
||||||
|
-1, weights.shape[1])
|
||||||
|
weights *= b_scales
|
||||||
|
return F.linear(input, weights, bias)
|
||||||
|
|
||||||
|
|
||||||
|
def dequant_no_scale(
|
||||||
|
input: torch.Tensor, # [..., in_features]
|
||||||
|
codes: torch.IntTensor, # [num_out_groups, num_in_groups, num_codebooks]
|
||||||
|
codebooks: torch.
|
||||||
|
Tensor, # [num_codebooks, codebook_size, out_group_size, in_group_size]
|
||||||
|
scales: torch.Tensor, # [num_out_groups, 1, 1, 1]
|
||||||
|
output_partition_sizes: torch.IntTensor,
|
||||||
|
bias: Optional[torch.Tensor],
|
||||||
|
) -> torch.Tensor:
|
||||||
|
|
||||||
|
weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes)
|
||||||
|
|
||||||
|
return F.linear(input, weights, bias)
|
||||||
|
|
||||||
|
|
||||||
|
# Compare the optimized 1x16 and 2x8 cuda decompression/dequant kernels against
|
||||||
|
# the generic pytorch version.
|
||||||
|
# Just visual comparison.
|
||||||
|
def dequant_test(k: int, parts: torch.tensor, nbooks: int, bits: int) -> None:
|
||||||
|
|
||||||
|
n = parts.sum().item()
|
||||||
|
|
||||||
|
device = torch.device('cuda:0')
|
||||||
|
|
||||||
|
code_range = (1 << bits) // 2
|
||||||
|
ingroups = 8
|
||||||
|
|
||||||
|
codes = torch.randint(-code_range,
|
||||||
|
code_range,
|
||||||
|
size=(n, k // ingroups, nbooks),
|
||||||
|
dtype=get_int_dtype(bits),
|
||||||
|
device=device)
|
||||||
|
|
||||||
|
codebooks = torch.randn(size=(parts.shape[0] * nbooks, 1 << bits, 1, 8),
|
||||||
|
dtype=torch.float16,
|
||||||
|
device=device)
|
||||||
|
|
||||||
|
count = 0
|
||||||
|
for index in range(16):
|
||||||
|
for i in range(8):
|
||||||
|
for book in range(nbooks):
|
||||||
|
codebooks[book, index, 0, i] = count * (10**book)
|
||||||
|
count += 1
|
||||||
|
|
||||||
|
print("codes shape", codes.shape)
|
||||||
|
|
||||||
|
for i in range(16):
|
||||||
|
for book in range(nbooks):
|
||||||
|
codes[0, i, book] = i
|
||||||
|
codes[0, -i, book] = i
|
||||||
|
|
||||||
|
weights = dequantize_weight(codes, codebooks, None)
|
||||||
|
weights2 = ops.aqlm_dequant(codes, codebooks, parts)
|
||||||
|
|
||||||
|
print("weights shape:", weights.shape)
|
||||||
|
print("weights2 shape:", weights2.shape)
|
||||||
|
|
||||||
|
print("weights are:", weights)
|
||||||
|
print("weights2 are:", weights2)
|
||||||
|
|
||||||
|
print("first 128 weights are", weights[0, 0:128].to(torch.int32))
|
||||||
|
print("first 128 weights2 are:", weights2[0, 0:128].to(torch.int32))
|
||||||
|
|
||||||
|
print("last 128 weights are", weights[0, -128:])
|
||||||
|
print("last 128 weights2 are:", weights2[0, -128:])
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
|
||||||
|
parser = argparse.ArgumentParser(description="Benchmark aqlm performance.")
|
||||||
|
|
||||||
|
# Add arguments
|
||||||
|
parser.add_argument("--nbooks",
|
||||||
|
type=int,
|
||||||
|
default=1,
|
||||||
|
help="Number of codebooks (default: 1)")
|
||||||
|
parser.add_argument("--bits",
|
||||||
|
type=int,
|
||||||
|
default=16,
|
||||||
|
help="Number of bits per code element (default: 16)")
|
||||||
|
parser.add_argument(
|
||||||
|
"--test",
|
||||||
|
type=bool,
|
||||||
|
default=False,
|
||||||
|
help="Run the decompression/dequant tester rather than benchmarking "
|
||||||
|
"(default: False)")
|
||||||
|
|
||||||
|
# Parse the arguments
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
# Extract values
|
||||||
|
nbooks = args.nbooks
|
||||||
|
bits = args.bits
|
||||||
|
|
||||||
|
if args.test:
|
||||||
|
dequant_test(4096, torch.tensor((4096, )), nbooks, bits)
|
||||||
|
return
|
||||||
|
|
||||||
|
# Otherwise, benchmark.
|
||||||
|
methods = [
|
||||||
|
ops.aqlm_gemm,
|
||||||
|
dequant_out_scale,
|
||||||
|
generic_dequantize_gemm,
|
||||||
|
optimized_dequantize_gemm,
|
||||||
|
dequant_weight_scale,
|
||||||
|
torch_mult,
|
||||||
|
dequant_no_scale,
|
||||||
|
]
|
||||||
|
|
||||||
|
filename = f"./aqlm_benchmark_{nbooks}x{bits}.csv"
|
||||||
|
print(f"writing benchmarks to file {filename}")
|
||||||
|
with open(filename, "w") as f:
|
||||||
|
sys.stdout = f
|
||||||
|
|
||||||
|
print('m | k | n | n parts', end='')
|
||||||
|
for method in methods:
|
||||||
|
print(f" | {method.__name__.replace('_', ' ')} (µs)", end='')
|
||||||
|
print('')
|
||||||
|
|
||||||
|
# These are reasonable prefill sizes.
|
||||||
|
ksandpartions = ((4096, (4096, 4096, 4096)), (4096, (4096, )),
|
||||||
|
(4096, (11008, 11008)), (11008, (4096, )))
|
||||||
|
|
||||||
|
# reasonable ranges for m.
|
||||||
|
for m in [
|
||||||
|
1, 2, 4, 8, 10, 12, 14, 16, 24, 32, 48, 52, 56, 64, 96, 112,
|
||||||
|
128, 256, 512, 1024, 1536, 2048, 3072, 4096
|
||||||
|
]:
|
||||||
|
print(f'{m}', file=sys.__stdout__)
|
||||||
|
for ksp in ksandpartions:
|
||||||
|
run_grid(m, ksp[0], torch.tensor(ksp[1]), nbooks, bits,
|
||||||
|
methods)
|
||||||
|
|
||||||
|
sys.stdout = sys.__stdout__
|
||||||
|
|
||||||
|
|
||||||
|
def run_grid(m: int, k: int, parts: torch.tensor, nbooks: int, bits: int,
|
||||||
|
methods):
|
||||||
|
|
||||||
|
# I didn't see visible improvements from increasing these, but feel free :)
|
||||||
|
num_warmup_trials = 1
|
||||||
|
num_trials = 1
|
||||||
|
|
||||||
|
num_calls = 100
|
||||||
|
|
||||||
|
# warmup.
|
||||||
|
for method in methods:
|
||||||
|
for _ in range(num_warmup_trials):
|
||||||
|
run_timing(
|
||||||
|
num_calls=num_calls,
|
||||||
|
m=m,
|
||||||
|
k=k,
|
||||||
|
parts=parts,
|
||||||
|
nbooks=nbooks,
|
||||||
|
bits=bits,
|
||||||
|
method=method,
|
||||||
|
)
|
||||||
|
|
||||||
|
n = parts.sum().item()
|
||||||
|
print(f'{m} | {k} | {n} | {parts.tolist()}', end='')
|
||||||
|
|
||||||
|
for method in methods:
|
||||||
|
best_time_us = 1e20
|
||||||
|
for _ in range(num_trials):
|
||||||
|
kernel_dur_ms = run_timing(
|
||||||
|
num_calls=num_calls,
|
||||||
|
m=m,
|
||||||
|
k=k,
|
||||||
|
parts=parts,
|
||||||
|
nbooks=nbooks,
|
||||||
|
bits=bits,
|
||||||
|
method=method,
|
||||||
|
)
|
||||||
|
|
||||||
|
kernel_dur_us = 1000 * kernel_dur_ms
|
||||||
|
|
||||||
|
if kernel_dur_us < best_time_us:
|
||||||
|
best_time_us = kernel_dur_us
|
||||||
|
|
||||||
|
print(f' | {kernel_dur_us:.0f}', end='')
|
||||||
|
|
||||||
|
print('')
|
||||||
|
|
||||||
|
|
||||||
|
def run_timing(num_calls: int, m: int, k: int, parts: torch.tensor,
|
||||||
|
nbooks: int, bits: int, method) -> float:
|
||||||
|
|
||||||
|
n = parts.sum().item()
|
||||||
|
|
||||||
|
device = torch.device('cuda:0')
|
||||||
|
|
||||||
|
input = torch.randn((1, m, k), dtype=torch.float16, device=device)
|
||||||
|
|
||||||
|
code_range = (1 << bits) // 2
|
||||||
|
ingroups = 8
|
||||||
|
|
||||||
|
codes = torch.randint(-code_range,
|
||||||
|
code_range,
|
||||||
|
size=(n, k // ingroups, nbooks),
|
||||||
|
dtype=get_int_dtype(bits),
|
||||||
|
device=device)
|
||||||
|
|
||||||
|
codebooks = torch.randn(size=(parts.shape[0] * nbooks, 1 << bits, 1, 8),
|
||||||
|
dtype=torch.float16,
|
||||||
|
device=device)
|
||||||
|
|
||||||
|
scales = torch.randn(size=(n, 1, 1, 1), dtype=torch.float16, device=device)
|
||||||
|
|
||||||
|
# for comparison to just a pytorch mult.
|
||||||
|
weights = torch.randn((n, k), dtype=torch.float16, device=device)
|
||||||
|
|
||||||
|
start_event = torch.cuda.Event(enable_timing=True)
|
||||||
|
end_event = torch.cuda.Event(enable_timing=True)
|
||||||
|
|
||||||
|
start_event.record()
|
||||||
|
|
||||||
|
if method is torch_mult:
|
||||||
|
for i in range(num_calls):
|
||||||
|
torch_mult(input, weights, scales)
|
||||||
|
else:
|
||||||
|
for i in range(num_calls):
|
||||||
|
method(input, codes, codebooks, scales, parts, None)
|
||||||
|
|
||||||
|
end_event.record()
|
||||||
|
end_event.synchronize()
|
||||||
|
|
||||||
|
dur_ms = start_event.elapsed_time(end_event) / num_calls
|
||||||
|
return dur_ms
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
sys.exit(main())
|
||||||
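The AQLM benchmark above redirects stdout into ./aqlm_benchmark_{nbooks}x{bits}.csv, producing '|'-separated rows with one column per method and times in microseconds. A rough post-processing sketch, assuming the default --nbooks 1 --bits 16 run:

# Parse the '|'-separated table written by benchmark_aqlm.py.
with open("./aqlm_benchmark_1x16.csv") as f:
    rows = [[cell.strip() for cell in line.split("|")]
            for line in f if line.strip()]

header, data = rows[0], rows[1:]
for row in data:
    # First four columns are m, k, n and the partition list; the rest are
    # the per-method kernel times in microseconds.
    print(dict(zip(header, row)))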
benchmarks/kernels/benchmark_marlin.py (new file, +233 lines)
@@ -0,0 +1,233 @@
|
|||||||
|
import argparse
|
||||||
|
|
||||||
|
import torch
|
||||||
|
import torch.utils.benchmark as benchmark
|
||||||
|
from benchmark_shapes import WEIGHT_SHAPES
|
||||||
|
|
||||||
|
from vllm import _custom_ops as ops
|
||||||
|
from vllm.model_executor.layers.quantization.gptq_marlin import (
|
||||||
|
GPTQ_MARLIN_MAX_PARALLEL, GPTQ_MARLIN_MIN_THREAD_N,
|
||||||
|
GPTQ_MARLIN_SUPPORTED_GROUP_SIZES, GPTQ_MARLIN_SUPPORTED_NUM_BITS)
|
||||||
|
from vllm.model_executor.layers.quantization.gptq_marlin_24 import (
|
||||||
|
GPTQ_MARLIN_24_MAX_PARALLEL, GPTQ_MARLIN_24_MIN_THREAD_N,
|
||||||
|
GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES, GPTQ_MARLIN_24_SUPPORTED_NUM_BITS)
|
||||||
|
from vllm.model_executor.layers.quantization.utils.marlin_utils import (
|
||||||
|
MarlinWorkspace, marlin_24_quantize, marlin_quantize)
|
||||||
|
from vllm.model_executor.layers.quantization.utils.quant_utils import (
|
||||||
|
gptq_pack, quantize_weights, sort_weights)
|
||||||
|
|
||||||
|
DEFAULT_MODELS = ["meta-llama/Llama-2-7b-hf/TP1"]
|
||||||
|
DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512]
|
||||||
|
|
||||||
|
ACT_ORDER_OPTS = [False, True]
|
||||||
|
K_FULL_OPTS = [False, True]
|
||||||
|
|
||||||
|
|
||||||
|
def bench_run(results, model, act_order, is_k_full, num_bits, group_size,
|
||||||
|
size_m, size_k, size_n):
|
||||||
|
label = "Quant Matmul"
|
||||||
|
|
||||||
|
sub_label = ("{}, act={} k_full={}, b={}, g={}, "
|
||||||
|
"MKN=({}x{}x{})".format(model, act_order, is_k_full, num_bits,
|
||||||
|
group_size, size_m, size_k, size_n))
|
||||||
|
|
||||||
|
print(f"Testing: {sub_label}")
|
||||||
|
|
||||||
|
a = torch.randn(size_m, size_k).to(torch.half).cuda()
|
||||||
|
b = torch.rand(size_k, size_n).to(torch.half).cuda()
|
||||||
|
|
||||||
|
a_tmp = (torch.zeros(size_m, size_k).to(torch.half).cuda())
|
||||||
|
|
||||||
|
# Marlin quant
|
||||||
|
(
|
||||||
|
marlin_w_ref,
|
||||||
|
marlin_q_w,
|
||||||
|
marlin_s,
|
||||||
|
marlin_g_idx,
|
||||||
|
marlin_sort_indices,
|
||||||
|
marlin_rand_perm,
|
||||||
|
) = marlin_quantize(b, num_bits, group_size, act_order)
|
||||||
|
|
||||||
|
# Marlin_24 quant
|
||||||
|
(marlin_24_w_ref, marlin_24_q_w_comp, marlin_24_meta,
|
||||||
|
marlin_24_s) = marlin_24_quantize(b, num_bits, group_size)
|
||||||
|
|
||||||
|
# GPTQ quant
|
||||||
|
(w_ref, q_w, s, g_idx,
|
||||||
|
rand_perm) = quantize_weights(b, num_bits, group_size, act_order)
|
||||||
|
q_w_gptq = gptq_pack(q_w, num_bits, size_k, size_n)
|
||||||
|
|
||||||
|
# For act_order, sort the "weights" and "g_idx"
|
||||||
|
# so that group ids are increasing
|
||||||
|
repack_sort_indices = torch.empty(0, dtype=torch.int, device=b.device)
|
||||||
|
if act_order:
|
||||||
|
(q_w, g_idx, repack_sort_indices) = sort_weights(q_w, g_idx)
|
||||||
|
|
||||||
|
# Prepare
|
||||||
|
marlin_workspace = MarlinWorkspace(size_n, GPTQ_MARLIN_MIN_THREAD_N,
|
||||||
|
GPTQ_MARLIN_MAX_PARALLEL)
|
||||||
|
|
||||||
|
marlin_24_workspace = MarlinWorkspace(size_n, GPTQ_MARLIN_24_MIN_THREAD_N,
|
||||||
|
GPTQ_MARLIN_24_MAX_PARALLEL)
|
||||||
|
|
||||||
|
globals = {
|
||||||
|
# Gen params
|
||||||
|
"num_bits": num_bits,
|
||||||
|
"group_size": group_size,
|
||||||
|
"size_m": size_m,
|
||||||
|
"size_n": size_n,
|
||||||
|
"size_k": size_k,
|
||||||
|
"a": a,
|
||||||
|
"a_tmp": a_tmp,
|
||||||
|
# Marlin params
|
||||||
|
"marlin_w_ref": marlin_w_ref,
|
||||||
|
"marlin_q_w": marlin_q_w,
|
||||||
|
"marlin_s": marlin_s,
|
||||||
|
"marlin_g_idx": marlin_g_idx,
|
||||||
|
"marlin_sort_indices": marlin_sort_indices,
|
||||||
|
"marlin_rand_perm": marlin_rand_perm,
|
||||||
|
"marlin_workspace": marlin_workspace,
|
||||||
|
"is_k_full": is_k_full,
|
||||||
|
# Marlin_24 params
|
||||||
|
"marlin_24_w_ref": marlin_24_w_ref,
|
||||||
|
"marlin_24_q_w_comp": marlin_24_q_w_comp,
|
||||||
|
"marlin_24_meta": marlin_24_meta,
|
||||||
|
"marlin_24_s": marlin_24_s,
|
||||||
|
"marlin_24_workspace": marlin_24_workspace,
|
||||||
|
# GPTQ params
|
||||||
|
"q_w_gptq": q_w_gptq,
|
||||||
|
"repack_sort_indices": repack_sort_indices,
|
||||||
|
# Kernels
|
||||||
|
"gptq_marlin_gemm": ops.gptq_marlin_gemm,
|
||||||
|
"gptq_marlin_24_gemm": ops.gptq_marlin_24_gemm,
|
||||||
|
"gptq_marlin_repack": ops.gptq_marlin_repack,
|
||||||
|
}
|
||||||
|
|
||||||
|
min_run_time = 1
|
||||||
|
|
||||||
|
# Warmup pytorch
|
||||||
|
for i in range(5):
|
||||||
|
torch.matmul(a, marlin_w_ref)
|
||||||
|
|
||||||
|
results.append(
|
||||||
|
benchmark.Timer(
|
||||||
|
stmt="torch.matmul(a, marlin_w_ref)",
|
||||||
|
globals=globals,
|
||||||
|
label=label,
|
||||||
|
sub_label=sub_label,
|
||||||
|
description="pytorch_gemm",
|
||||||
|
).blocked_autorange(min_run_time=min_run_time))
|
||||||
|
|
||||||
|
results.append(
|
||||||
|
benchmark.Timer(
|
||||||
|
stmt=
|
||||||
|
"output = gptq_marlin_gemm(a, marlin_q_w, marlin_s, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, num_bits, size_m, size_n, size_k, is_k_full)", # noqa: E501
|
||||||
|
globals=globals,
|
||||||
|
label=label,
|
||||||
|
sub_label=sub_label,
|
||||||
|
description="gptq_marlin_gemm",
|
||||||
|
).blocked_autorange(min_run_time=min_run_time))
|
||||||
|
|
||||||
|
if (num_bits in GPTQ_MARLIN_24_SUPPORTED_NUM_BITS
|
||||||
|
and group_size in GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES):
|
||||||
|
results.append(
|
||||||
|
benchmark.Timer(
|
||||||
|
stmt=
|
||||||
|
"output = gptq_marlin_24_gemm(a, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s, marlin_24_workspace.scratch, num_bits, size_m, size_n, size_k)", # noqa: E501
|
||||||
|
globals=globals,
|
||||||
|
label=label,
|
||||||
|
sub_label=sub_label,
|
||||||
|
description="gptq_marlin_24_gemm",
|
||||||
|
).blocked_autorange(min_run_time=min_run_time))
|
||||||
|
|
||||||
|
results.append(
|
||||||
|
benchmark.Timer(
|
||||||
|
stmt=
|
||||||
|
"q_res = gptq_marlin_repack(q_w_gptq, repack_sort_indices, size_k, size_n, num_bits)", # noqa: E501
|
||||||
|
globals=globals,
|
||||||
|
label=label,
|
||||||
|
sub_label=sub_label,
|
||||||
|
description="gptq_marlin_repack",
|
||||||
|
).blocked_autorange(min_run_time=min_run_time))
|
||||||
|
|
||||||
|
|
||||||
|
def main(args):
|
||||||
|
print("Benchmarking models:")
|
||||||
|
for i, model in enumerate(args.models):
|
||||||
|
print(f"[{i}] {model}")
|
||||||
|
|
||||||
|
results = []
|
||||||
|
|
||||||
|
for model in args.models:
|
||||||
|
for layer in WEIGHT_SHAPES[model]:
|
||||||
|
size_k = layer[0]
|
||||||
|
size_n = layer[1]
|
||||||
|
|
||||||
|
if len(args.limit_k) > 0 and size_k not in args.limit_k:
|
||||||
|
continue
|
||||||
|
|
||||||
|
if len(args.limit_n) > 0 and size_n not in args.limit_n:
|
||||||
|
continue
|
||||||
|
|
||||||
|
for act_order in ACT_ORDER_OPTS:
|
||||||
|
if len(args.limit_act_order
|
||||||
|
) > 0 and act_order not in args.limit_act_order:
|
||||||
|
continue
|
||||||
|
|
||||||
|
for is_k_full in K_FULL_OPTS:
|
||||||
|
if len(args.limit_k_full
|
||||||
|
) > 0 and is_k_full not in args.limit_k_full:
|
||||||
|
continue
|
||||||
|
|
||||||
|
for num_bits in GPTQ_MARLIN_SUPPORTED_NUM_BITS:
|
||||||
|
if len(args.limit_num_bits
|
||||||
|
) > 0 and num_bits not in args.limit_num_bits:
|
||||||
|
continue
|
||||||
|
|
||||||
|
for group_size in GPTQ_MARLIN_SUPPORTED_GROUP_SIZES:
|
||||||
|
if len(
|
||||||
|
args.limit_group_size
|
||||||
|
) > 0 and group_size not in args.limit_group_size:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# For act_order, the group_size must be less than
|
||||||
|
# size_k
|
||||||
|
if act_order and (group_size == size_k
|
||||||
|
or group_size == -1):
|
||||||
|
continue
|
||||||
|
|
||||||
|
for size_m in args.batch_sizes:
|
||||||
|
bench_run(results, model, act_order, is_k_full,
|
||||||
|
num_bits, group_size, size_m, size_k,
|
||||||
|
size_n)
|
||||||
|
|
||||||
|
compare = benchmark.Compare(results)
|
||||||
|
compare.print()
|
||||||
|
|
||||||
|
|
||||||
|
# For quick benchmarking use:
|
||||||
|
# python benchmark_marlin.py --batch-sizes 1 16 32 --limit-k 4096 --limit-n 4096 --limit-group-size 128 --limit-num-bits 4 --limit-act-order 0 --limit-k-full 1 # noqa E501
|
||||||
|
#
|
||||||
|
if __name__ == "__main__":
|
||||||
|
parser = argparse.ArgumentParser(
|
||||||
|
description="Benchmark Marlin across specified models/shapes/batches")
|
||||||
|
parser.add_argument(
|
||||||
|
"--models",
|
||||||
|
nargs="+",
|
||||||
|
type=str,
|
||||||
|
default=DEFAULT_MODELS,
|
||||||
|
choices=WEIGHT_SHAPES.keys(),
|
||||||
|
)
|
||||||
|
parser.add_argument("--batch-sizes",
|
||||||
|
nargs="+",
|
||||||
|
type=int,
|
||||||
|
default=DEFAULT_BATCH_SIZES)
|
||||||
|
parser.add_argument("--limit-k", nargs="+", type=int, default=[])
|
||||||
|
parser.add_argument("--limit-n", nargs="+", type=int, default=[])
|
||||||
|
parser.add_argument("--limit-group-size", nargs="+", type=int, default=[])
|
||||||
|
parser.add_argument("--limit-num-bits", nargs="+", type=int, default=[])
|
||||||
|
parser.add_argument("--limit-act-order", nargs="+", type=int, default=[])
|
||||||
|
parser.add_argument("--limit-k-full", nargs="+", type=int, default=[])
|
||||||
|
|
||||||
|
args = parser.parse_args()
|
||||||
|
main(args)
|
||||||
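Since bench_run() in benchmark_marlin.py already builds both the reference weights and the Marlin inputs, a correctness spot-check can reuse the exact call from the gptq_marlin_gemm Timer stmt before timing. A hedged sketch of such a check, assuming the imports at the top of that file and with arbitrary tolerances:

def sanity_check_marlin(a, marlin_w_ref, marlin_q_w, marlin_s, marlin_g_idx,
                        marlin_sort_indices, marlin_workspace, num_bits,
                        size_m, size_n, size_k, is_k_full):
    # Same call as the "gptq_marlin_gemm" Timer stmt above.
    out = ops.gptq_marlin_gemm(a, marlin_q_w, marlin_s, marlin_g_idx,
                               marlin_sort_indices, marlin_workspace.scratch,
                               num_bits, size_m, size_n, size_k, is_k_full)
    ref = torch.matmul(a, marlin_w_ref)
    # Quantization error makes bit-exact equality impossible; use loose tolerances.
    assert torch.allclose(out, ref, atol=5e-1, rtol=5e-2)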
benchmarks/kernels/benchmark_moe.py (new file, +322 lines)
@@ -0,0 +1,322 @@
|
|||||||
|
import argparse
|
||||||
|
import time
|
||||||
|
from datetime import datetime
|
||||||
|
from typing import Any, Dict, List, Tuple
|
||||||
|
|
||||||
|
import ray
|
||||||
|
import torch
|
||||||
|
import triton
|
||||||
|
from ray.experimental.tqdm_ray import tqdm
|
||||||
|
from transformers import AutoConfig
|
||||||
|
|
||||||
|
from vllm.model_executor.layers.fused_moe.fused_moe import *
|
||||||
|
|
||||||
|
|
||||||
|
def benchmark_config(
|
||||||
|
config: Dict[str, int],
|
||||||
|
num_tokens: int,
|
||||||
|
num_experts: int,
|
||||||
|
shard_intermediate_size: int,
|
||||||
|
hidden_size: int,
|
||||||
|
topk: int,
|
||||||
|
dtype: torch.dtype,
|
||||||
|
use_fp8: bool,
|
||||||
|
num_iters: int = 100,
|
||||||
|
) -> float:
|
||||||
|
init_dtype = torch.float16 if use_fp8 else dtype
|
||||||
|
x = torch.randn(num_tokens, hidden_size, dtype=dtype)
|
||||||
|
w1 = torch.randn(num_experts,
|
||||||
|
shard_intermediate_size,
|
||||||
|
hidden_size,
|
||||||
|
dtype=init_dtype)
|
||||||
|
w2 = torch.randn(num_experts,
|
||||||
|
hidden_size,
|
||||||
|
shard_intermediate_size // 2,
|
||||||
|
dtype=init_dtype)
|
||||||
|
gating_output = torch.randn(num_iters,
|
||||||
|
num_tokens,
|
||||||
|
num_experts,
|
||||||
|
dtype=torch.float32)
|
||||||
|
|
||||||
|
w1_scale = None
|
||||||
|
w2_scale = None
|
||||||
|
a1_scale = None
|
||||||
|
a2_scale = None
|
||||||
|
if use_fp8:
|
||||||
|
w1_scale = torch.randn(num_experts, dtype=torch.float32)
|
||||||
|
w2_scale = torch.randn(num_experts, dtype=torch.float32)
|
||||||
|
a1_scale = torch.randn(1, dtype=torch.float32)
|
||||||
|
a2_scale = torch.randn(1, dtype=torch.float32)
|
||||||
|
|
||||||
|
w1 = w1.to(torch.float8_e4m3fn)
|
||||||
|
w2 = w2.to(torch.float8_e4m3fn)
|
||||||
|
|
||||||
|
input_gating = torch.empty(num_tokens, num_experts, dtype=torch.float32)
|
||||||
|
|
||||||
|
def prepare(i: int):
|
||||||
|
input_gating.copy_(gating_output[i])
|
||||||
|
|
||||||
|
def run():
|
||||||
|
fused_moe(
|
||||||
|
x,
|
||||||
|
w1,
|
||||||
|
w2,
|
||||||
|
input_gating,
|
||||||
|
topk,
|
||||||
|
renormalize=True,
|
||||||
|
inplace=True,
|
||||||
|
override_config=config,
|
||||||
|
use_fp8=use_fp8,
|
||||||
|
w1_scale=w1_scale,
|
||||||
|
w2_scale=w2_scale,
|
||||||
|
a1_scale=a1_scale,
|
||||||
|
a2_scale=a2_scale,
|
||||||
|
)
|
||||||
|
|
||||||
|
# JIT compilation & warmup
|
||||||
|
run()
|
||||||
|
torch.cuda.synchronize()
|
||||||
|
|
||||||
|
# Capture 10 invocations with CUDA graph
|
||||||
|
graph = torch.cuda.CUDAGraph()
|
||||||
|
with torch.cuda.graph(graph):
|
||||||
|
for _ in range(10):
|
||||||
|
run()
|
||||||
|
torch.cuda.synchronize()
|
||||||
|
|
||||||
|
# Warmup
|
||||||
|
for _ in range(5):
|
||||||
|
graph.replay()
|
||||||
|
torch.cuda.synchronize()
|
||||||
|
|
||||||
|
start_event = torch.cuda.Event(enable_timing=True)
|
||||||
|
end_event = torch.cuda.Event(enable_timing=True)
|
||||||
|
|
||||||
|
latencies = []
|
||||||
|
for i in range(num_iters):
|
||||||
|
prepare(i)
|
||||||
|
torch.cuda.synchronize()
|
||||||
|
|
||||||
|
start_event.record()
|
||||||
|
graph.replay()
|
||||||
|
end_event.record()
|
||||||
|
end_event.synchronize()
|
||||||
|
latencies.append(start_event.elapsed_time(end_event))
|
||||||
|
avg = sum(latencies) / (num_iters * 10) * 1000 # us
|
||||||
|
graph.reset()
|
||||||
|
return avg
|
||||||
|
|
||||||
|
|
||||||
|
def get_configs_compute_bound() -> List[Dict[str, int]]:
|
||||||
|
# Reduced search space for faster tuning.
|
||||||
|
# TODO(woosuk): Increase the search space and use a performance model to
|
||||||
|
# prune the search space.
|
||||||
|
configs = []
|
||||||
|
for num_stages in [2, 3, 4, 5]:
|
||||||
|
for block_m in [16, 32, 64, 128, 256]:
|
||||||
|
for block_k in [64, 128, 256]:
|
||||||
|
for block_n in [32, 64, 128, 256]:
|
||||||
|
for num_warps in [4, 8]:
|
||||||
|
for group_size in [1, 16, 32, 64]:
|
||||||
|
configs.append({
|
||||||
|
"BLOCK_SIZE_M": block_m,
|
||||||
|
"BLOCK_SIZE_N": block_n,
|
||||||
|
"BLOCK_SIZE_K": block_k,
|
||||||
|
"GROUP_SIZE_M": group_size,
|
||||||
|
"num_warps": num_warps,
|
||||||
|
"num_stages": num_stages,
|
||||||
|
})
|
||||||
|
return configs
|
||||||
|
|
||||||
|
|
||||||
|
@ray.remote(num_gpus=1)
|
||||||
|
class BenchmarkWorker:
|
||||||
|
|
||||||
|
def __init__(self, seed: int) -> None:
|
||||||
|
torch.set_default_device("cuda")
|
||||||
|
torch.cuda.manual_seed_all(seed)
|
||||||
|
self.seed = seed
|
||||||
|
|
||||||
|
def benchmark(
|
||||||
|
self,
|
||||||
|
num_tokens: int,
|
||||||
|
num_experts: int,
|
||||||
|
shard_intermediate_size: int,
|
||||||
|
hidden_size: int,
|
||||||
|
topk: int,
|
||||||
|
dtype: torch.dtype,
|
||||||
|
use_fp8: bool,
|
||||||
|
) -> Tuple[Dict[str, int], float]:
|
||||||
|
torch.cuda.manual_seed_all(self.seed)
|
||||||
|
|
||||||
|
dtype_str = "float8" if use_fp8 else None
|
||||||
|
# NOTE(woosuk): The current naming convention uses w2.shape[2], which
|
||||||
|
# is the intermediate size after silu_and_mul.
|
||||||
|
op_config = get_moe_configs(num_experts, shard_intermediate_size // 2,
|
||||||
|
dtype_str)
|
||||||
|
if op_config is None:
|
||||||
|
config = get_default_config(num_tokens, num_experts,
|
||||||
|
shard_intermediate_size, hidden_size,
|
||||||
|
topk, dtype_str)
|
||||||
|
else:
|
||||||
|
config = op_config[min(op_config.keys(),
|
||||||
|
key=lambda x: abs(x - num_tokens))]
|
||||||
|
kernel_time = benchmark_config(config, num_tokens, num_experts,
|
||||||
|
shard_intermediate_size, hidden_size,
|
||||||
|
topk, dtype, use_fp8)
|
||||||
|
return config, kernel_time
|
||||||
|
|
||||||
|
def tune(
|
||||||
|
self,
|
||||||
|
num_tokens: int,
|
||||||
|
num_experts: int,
|
||||||
|
shard_intermediate_size: int,
|
||||||
|
hidden_size: int,
|
||||||
|
topk: int,
|
||||||
|
dtype: torch.dtype,
|
||||||
|
use_fp8: bool,
|
||||||
|
search_space: List[Dict[str, int]],
|
||||||
|
) -> Dict[str, int]:
|
||||||
|
best_config = None
|
||||||
|
best_time = float("inf")
|
||||||
|
for config in tqdm(search_space):
|
||||||
|
try:
|
||||||
|
kernel_time = benchmark_config(config,
|
||||||
|
num_tokens,
|
||||||
|
num_experts,
|
||||||
|
shard_intermediate_size,
|
||||||
|
hidden_size,
|
||||||
|
topk,
|
||||||
|
dtype,
|
||||||
|
use_fp8,
|
||||||
|
num_iters=10)
|
||||||
|
except triton.runtime.autotuner.OutOfResources:
|
||||||
|
# Some configurations may be invalid and fail to compile.
|
||||||
|
continue
|
||||||
|
|
||||||
|
if kernel_time < best_time:
|
||||||
|
best_time = kernel_time
|
||||||
|
best_config = config
|
||||||
|
now = datetime.now()
|
||||||
|
print(f"{now.ctime()}] Completed tuning for batch_size={num_tokens}")
|
||||||
|
return best_config
|
||||||
|
|
||||||
|
|
||||||
|
def sort_config(config: Dict[str, int]) -> Dict[str, int]:
|
||||||
|
return {
|
||||||
|
"BLOCK_SIZE_M": config["BLOCK_SIZE_M"],
|
||||||
|
"BLOCK_SIZE_N": config["BLOCK_SIZE_N"],
|
||||||
|
"BLOCK_SIZE_K": config["BLOCK_SIZE_K"],
|
||||||
|
"GROUP_SIZE_M": config["GROUP_SIZE_M"],
|
||||||
|
"num_warps": config["num_warps"],
|
||||||
|
"num_stages": config["num_stages"],
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def save_configs(
|
||||||
|
configs: Dict[int, Dict[str, int]],
|
||||||
|
num_experts: int,
|
||||||
|
shard_intermediate_size: int,
|
||||||
|
hidden_size: int,
|
||||||
|
topk: int,
|
||||||
|
dtype: torch.dtype,
|
||||||
|
use_fp8: bool,
|
||||||
|
) -> None:
|
||||||
|
dtype_str = "float8" if use_fp8 else None
|
||||||
|
# NOTE(woosuk): The current naming convention uses w2.shape[2], which
|
||||||
|
# is the intermediate size after silu_and_mul.
|
||||||
|
filename = get_config_file_name(num_experts, shard_intermediate_size // 2,
|
||||||
|
dtype_str)
|
||||||
|
print(f"Writing best config to {filename}...")
|
||||||
|
with open(filename, "w") as f:
|
||||||
|
json.dump(configs, f, indent=4)
|
||||||
|
f.write("\n")
|
||||||
|
|
||||||
|
|
||||||
|
def main(args: argparse.Namespace):
|
||||||
|
print(args)
|
||||||
|
|
||||||
|
config = AutoConfig.from_pretrained(args.model)
|
||||||
|
if config.architectures[0] == "DbrxForCausalLM":
|
||||||
|
E = config.ffn_config.moe_num_experts
|
||||||
|
topk = config.ffn_config.moe_top_k
|
||||||
|
intermediate_size = config.ffn_config.ffn_hidden_size
|
||||||
|
shard_intermediate_size = 2 * intermediate_size // args.tp_size
|
||||||
|
else:
|
||||||
|
# Default: Mixtral.
|
||||||
|
E = config.num_local_experts
|
||||||
|
topk = config.num_experts_per_tok
|
||||||
|
intermediate_size = config.intermediate_size
|
||||||
|
shard_intermediate_size = 2 * intermediate_size // args.tp_size
|
||||||
|
|
||||||
|
hidden_size = config.hidden_size
|
||||||
|
dtype = config.torch_dtype
|
||||||
|
use_fp8 = args.dtype == "fp8"
|
||||||
|
|
||||||
|
if args.batch_size is None:
|
||||||
|
batch_sizes = [
|
||||||
|
1, 2, 4, 8, 16, 24, 32, 48, 64, 96, 128, 256, 512, 1024, 1536,
|
||||||
|
2048, 3072, 4096
|
||||||
|
]
|
||||||
|
else:
|
||||||
|
batch_sizes = [args.batch_size]
|
||||||
|
|
||||||
|
ray.init()
|
||||||
|
num_gpus = int(ray.available_resources()["GPU"])
|
||||||
|
workers = [BenchmarkWorker.remote(args.seed) for _ in range(num_gpus)]
|
||||||
|
|
||||||
|
def _distribute(method: str, inputs: List[Any]) -> List[Any]:
|
||||||
|
outputs = []
|
||||||
|
worker_idx = 0
|
||||||
|
for input_args in inputs:
|
||||||
|
worker = workers[worker_idx]
|
||||||
|
worker_method = getattr(worker, method)
|
||||||
|
output = worker_method.remote(*input_args)
|
||||||
|
outputs.append(output)
|
||||||
|
worker_idx = (worker_idx + 1) % num_gpus
|
||||||
|
return ray.get(outputs)
|
||||||
|
|
||||||
|
if args.tune:
|
||||||
|
search_space = get_configs_compute_bound()
|
||||||
|
print(f"Start tuning over {len(search_space)} configurations...")
|
||||||
|
|
||||||
|
start = time.time()
|
||||||
|
configs = _distribute(
|
||||||
|
"tune", [(batch_size, E, shard_intermediate_size, hidden_size,
|
||||||
|
topk, dtype, use_fp8, search_space)
|
||||||
|
for batch_size in batch_sizes])
|
||||||
|
best_configs = {
|
||||||
|
M: sort_config(config)
|
||||||
|
for M, config in zip(batch_sizes, configs)
|
||||||
|
}
|
||||||
|
save_configs(best_configs, E, shard_intermediate_size, hidden_size,
|
||||||
|
topk, dtype, use_fp8)
|
||||||
|
end = time.time()
|
||||||
|
print(f"Tuning took {end - start:.2f} seconds")
|
||||||
|
else:
|
||||||
|
outputs = _distribute("benchmark",
|
||||||
|
[(batch_size, E, shard_intermediate_size,
|
||||||
|
hidden_size, topk, dtype, use_fp8)
|
||||||
|
for batch_size in batch_sizes])
|
||||||
|
|
||||||
|
for batch_size, (config, kernel_time) in zip(batch_sizes, outputs):
|
||||||
|
print(f"Batch size: {batch_size}, config: {config}")
|
||||||
|
print(f"Kernel time: {kernel_time:.2f} us")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
parser = argparse.ArgumentParser()
|
||||||
|
parser.add_argument("--model",
|
||||||
|
type=str,
|
||||||
|
default="mistralai/Mixtral-8x7B-Instruct-v0.1")
|
||||||
|
parser.add_argument("--tp-size", "-tp", type=int, default=2)
|
||||||
|
parser.add_argument("--dtype",
|
||||||
|
type=str,
|
||||||
|
choices=["auto", "fp8"],
|
||||||
|
default="auto")
|
||||||
|
parser.add_argument("--seed", type=int, default=0)
|
||||||
|
parser.add_argument("--batch-size", type=int, required=False)
|
||||||
|
parser.add_argument("--tune", action="store_true")
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
main(args)
|
||||||
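The --tune path of benchmark_moe.py writes the winning config per batch size as JSON (via save_configs), keyed by the number of tokens. A hedged sketch of reading such a file back and feeding one entry to fused_moe through override_config, assuming a file name produced by get_config_file_name:

import json

def load_tuned_config(filename: str, num_tokens: int):
    # The JSON maps batch size (serialized as a string key) to a kernel config dict.
    with open(filename) as f:
        configs = json.load(f)
    # Pick the tuned batch size closest to the requested one.
    best_key = min(configs.keys(), key=lambda k: abs(int(k) - num_tokens))
    return configs[best_key]

# config = load_tuned_config("<name returned by get_config_file_name>.json", num_tokens=64)
# fused_moe(x, w1, w2, input_gating, topk, renormalize=True,
#           inplace=True, override_config=config, use_fp8=use_fp8)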
@ -1,10 +1,12 @@
|
|||||||
import argparse
|
import argparse
|
||||||
import random
|
import random
|
||||||
import time
|
import time
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
from vllm import attention_ops
|
from vllm import _custom_ops as ops
|
||||||
|
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, create_kv_caches_with_random
|
||||||
|
|
||||||
NUM_BLOCKS = 1024
|
NUM_BLOCKS = 1024
|
||||||
PARTITION_SIZE = 512
|
PARTITION_SIZE = 512
|
||||||
@ -14,7 +16,7 @@ PARTITION_SIZE = 512
|
|||||||
def main(
|
def main(
|
||||||
version: str,
|
version: str,
|
||||||
num_seqs: int,
|
num_seqs: int,
|
||||||
context_len: int,
|
seq_len: int,
|
||||||
num_query_heads: int,
|
num_query_heads: int,
|
||||||
num_kv_heads: int,
|
num_kv_heads: int,
|
||||||
head_size: int,
|
head_size: int,
|
||||||
@ -23,36 +25,35 @@ def main(
|
|||||||
dtype: torch.dtype,
|
dtype: torch.dtype,
|
||||||
seed: int,
|
seed: int,
|
||||||
do_profile: bool,
|
do_profile: bool,
|
||||||
|
device: str = "cuda",
|
||||||
|
kv_cache_dtype: Optional[str] = None,
|
||||||
) -> None:
|
) -> None:
|
||||||
random.seed(seed)
|
random.seed(seed)
|
||||||
torch.random.manual_seed(seed)
|
torch.random.manual_seed(seed)
|
||||||
torch.cuda.manual_seed(seed)
|
if torch.cuda.is_available():
|
||||||
|
torch.cuda.manual_seed(seed)
|
||||||
|
|
||||||
scale = float(1.0 / (head_size**0.5))
|
scale = float(1.0 / (head_size**0.5))
|
||||||
query = torch.empty(num_seqs,
|
query = torch.empty(num_seqs,
|
||||||
num_query_heads,
|
num_query_heads,
|
||||||
head_size,
|
head_size,
|
||||||
dtype=dtype,
|
dtype=dtype,
|
||||||
device="cuda")
|
device=device)
|
||||||
query.uniform_(-scale, scale)
|
query.uniform_(-scale, scale)
|
||||||
|
|
||||||
assert num_query_heads % num_kv_heads == 0
|
assert num_query_heads % num_kv_heads == 0
|
||||||
num_queries_per_kv = num_query_heads // num_kv_heads
|
|
||||||
head_mapping = torch.repeat_interleave(
|
|
||||||
torch.arange(num_kv_heads, dtype=torch.int32, device="cuda"),
|
|
||||||
num_queries_per_kv)
|
|
||||||
alibi_slopes = None
|
alibi_slopes = None
|
||||||
if use_alibi:
|
if use_alibi:
|
||||||
alibi_slopes = torch.randn(num_query_heads,
|
alibi_slopes = torch.randn(num_query_heads,
|
||||||
dtype=torch.float,
|
dtype=torch.float,
|
||||||
device="cuda")
|
device=device)
|
||||||
|
|
||||||
context_lens = [context_len for _ in range(num_seqs)]
|
seq_lens = [seq_len for _ in range(num_seqs)]
|
||||||
max_context_len = max(context_lens)
|
max_seq_len = max(seq_lens)
|
||||||
context_lens = torch.tensor(context_lens, dtype=torch.int, device="cuda")
|
seq_lens = torch.tensor(seq_lens, dtype=torch.int, device=device)
|
||||||
|
|
||||||
# Create the block tables.
|
# Create the block tables.
|
||||||
max_num_blocks_per_seq = (max_context_len + block_size - 1) // block_size
|
max_num_blocks_per_seq = (max_seq_len + block_size - 1) // block_size
|
||||||
block_tables = []
|
block_tables = []
|
||||||
for _ in range(num_seqs):
|
for _ in range(num_seqs):
|
||||||
block_table = [
|
block_table = [
|
||||||
@ -60,24 +61,23 @@ def main(
|
|||||||
for _ in range(max_num_blocks_per_seq)
|
for _ in range(max_num_blocks_per_seq)
|
||||||
]
|
]
|
||||||
block_tables.append(block_table)
|
block_tables.append(block_table)
|
||||||
block_tables = torch.tensor(block_tables, dtype=torch.int, device="cuda")
|
block_tables = torch.tensor(block_tables, dtype=torch.int, device=device)
|
||||||
|
|
||||||
# Create the KV cache.
|
# Create the KV cache.
|
||||||
x = 16 // torch.tensor([], dtype=dtype).element_size()
|
key_caches, value_caches = create_kv_caches_with_random(NUM_BLOCKS,
|
||||||
key_cache_shape = (NUM_BLOCKS, num_kv_heads, head_size // x, block_size, x)
|
block_size,
|
||||||
key_cache = torch.empty(size=key_cache_shape, dtype=dtype, device="cuda")
|
1,
|
||||||
key_cache.uniform_(-scale, scale)
|
num_kv_heads,
|
||||||
value_cache_shape = (NUM_BLOCKS, num_kv_heads, head_size, block_size)
|
head_size,
|
||||||
value_cache = torch.empty(size=value_cache_shape,
|
kv_cache_dtype,
|
||||||
dtype=dtype,
|
dtype,
|
||||||
device="cuda")
|
device=device)
|
||||||
value_cache.uniform_(-scale, scale)
|
key_cache, value_cache = key_caches[0], value_caches[0]
|
||||||
|
|
||||||
# Prepare for the paged attention kernel.
|
# Prepare for the paged attention kernel.
|
||||||
output = torch.empty_like(query)
|
output = torch.empty_like(query)
|
||||||
if version == "v2":
|
if version == "v2":
|
||||||
num_partitions = ((max_context_len + PARTITION_SIZE - 1) //
|
num_partitions = ((max_seq_len + PARTITION_SIZE - 1) // PARTITION_SIZE)
|
||||||
PARTITION_SIZE)
|
|
||||||
tmp_output = torch.empty(
|
tmp_output = torch.empty(
|
||||||
size=(num_seqs, num_query_heads, num_partitions, head_size),
|
size=(num_seqs, num_query_heads, num_partitions, head_size),
|
||||||
dtype=output.dtype,
|
dtype=output.dtype,
|
||||||
@ -90,29 +90,34 @@ def main(
|
|||||||
)
|
)
|
||||||
max_logits = torch.empty_like(exp_sums)
|
max_logits = torch.empty_like(exp_sums)
|
||||||
|
|
||||||
def run_benchmark(num_iters: int, profile: bool = False) -> float:
|
def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
|
||||||
torch.cuda.synchronize()
|
torch.cuda.synchronize()
|
||||||
if profile:
|
if profile:
|
||||||
torch.cuda.cudart().cudaProfilerStart()
|
torch.cuda.cudart().cudaProfilerStart()
|
||||||
start_time = time.perf_counter()
|
start_time = time.perf_counter()
|
||||||
|
|
||||||
|
# Using default kv_scale
|
||||||
|
kv_scale = 1.0
|
||||||
|
|
||||||
for _ in range(num_iters):
|
for _ in range(num_iters):
|
||||||
if version == "v1":
|
if version == "v1":
|
||||||
attention_ops.paged_attention_v1(
|
ops.paged_attention_v1(
|
||||||
output,
|
output,
|
||||||
query,
|
query,
|
||||||
key_cache,
|
key_cache,
|
||||||
value_cache,
|
value_cache,
|
||||||
head_mapping,
|
num_kv_heads,
|
||||||
scale,
|
scale,
|
||||||
block_tables,
|
block_tables,
|
||||||
context_lens,
|
seq_lens,
|
||||||
block_size,
|
block_size,
|
||||||
max_context_len,
|
max_seq_len,
|
||||||
alibi_slopes,
|
alibi_slopes,
|
||||||
|
kv_cache_dtype,
|
||||||
|
kv_scale,
|
||||||
)
|
)
|
||||||
elif version == "v2":
|
elif version == "v2":
|
||||||
attention_ops.paged_attention_v2(
|
ops.paged_attention_v2(
|
||||||
output,
|
output,
|
||||||
exp_sums,
|
exp_sums,
|
||||||
max_logits,
|
max_logits,
|
||||||
@ -120,13 +125,15 @@ def main(
|
|||||||
query,
|
query,
|
||||||
key_cache,
|
key_cache,
|
||||||
value_cache,
|
value_cache,
|
||||||
head_mapping,
|
num_kv_heads,
|
||||||
scale,
|
scale,
|
||||||
block_tables,
|
block_tables,
|
||||||
context_lens,
|
seq_lens,
|
||||||
block_size,
|
block_size,
|
||||||
max_context_len,
|
max_seq_len,
|
||||||
alibi_slopes,
|
alibi_slopes,
|
||||||
|
kv_cache_dtype,
|
||||||
|
kv_scale,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
raise ValueError(f"Invalid version: {version}")
|
raise ValueError(f"Invalid version: {version}")
|
||||||
@ -139,6 +146,7 @@ def main(
|
|||||||
|
|
||||||
# Warmup.
|
# Warmup.
|
||||||
print("Warming up...")
|
print("Warming up...")
|
||||||
|
run_benchmark = run_cuda_benchmark
|
||||||
run_benchmark(num_iters=3, profile=False)
|
run_benchmark(num_iters=3, profile=False)
|
||||||
|
|
||||||
# Benchmark.
|
# Benchmark.
|
||||||
@ -157,12 +165,12 @@ if __name__ == '__main__':
|
|||||||
choices=["v1", "v2"],
|
choices=["v1", "v2"],
|
||||||
default="v2")
|
default="v2")
|
||||||
parser.add_argument("--batch-size", type=int, default=8)
|
parser.add_argument("--batch-size", type=int, default=8)
|
||||||
parser.add_argument("--context-len", type=int, default=4096)
|
parser.add_argument("--seq_len", type=int, default=4096)
|
||||||
parser.add_argument("--num-query-heads", type=int, default=64)
|
parser.add_argument("--num-query-heads", type=int, default=64)
|
||||||
parser.add_argument("--num-kv-heads", type=int, default=8)
|
parser.add_argument("--num-kv-heads", type=int, default=8)
|
||||||
parser.add_argument("--head-size",
|
parser.add_argument("--head-size",
|
||||||
type=int,
|
type=int,
|
||||||
choices=[64, 80, 96, 112, 128, 256],
|
choices=[64, 80, 96, 112, 128, 192, 256],
|
||||||
default=128)
|
default=128)
|
||||||
parser.add_argument("--block-size", type=int, choices=[16, 32], default=16)
|
parser.add_argument("--block-size", type=int, choices=[16, 32], default=16)
|
||||||
parser.add_argument("--use-alibi", action="store_true")
|
parser.add_argument("--use-alibi", action="store_true")
|
||||||
@ -172,26 +180,30 @@ if __name__ == '__main__':
|
|||||||
default="half")
|
default="half")
|
||||||
parser.add_argument("--seed", type=int, default=0)
|
parser.add_argument("--seed", type=int, default=0)
|
||||||
parser.add_argument("--profile", action="store_true")
|
parser.add_argument("--profile", action="store_true")
|
||||||
|
parser.add_argument(
|
||||||
|
"--kv-cache-dtype",
|
||||||
|
type=str,
|
||||||
|
choices=["auto", "fp8", "fp8_e5m2", "fp8_e4m3"],
|
||||||
|
default="auto",
|
||||||
|
help="Data type for kv cache storage. If 'auto', will use model "
|
||||||
|
"data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. "
|
||||||
|
"ROCm (AMD GPU) supports fp8 (=fp8_e4m3)")
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
print(args)
|
print(args)
|
||||||
|
|
||||||
if args.num_query_heads % args.num_kv_heads != 0:
|
if args.num_query_heads % args.num_kv_heads != 0:
|
||||||
raise ValueError("num_query_heads must be divisible by num_kv_heads")
|
raise ValueError("num_query_heads must be divisible by num_kv_heads")
|
||||||
dtype_to_torch_dtype = {
|
|
||||||
"half": torch.half,
|
|
||||||
"bfloat16": torch.bfloat16,
|
|
||||||
"float": torch.float,
|
|
||||||
}
|
|
||||||
main(
|
main(
|
||||||
version=args.version,
|
version=args.version,
|
||||||
num_seqs=args.batch_size,
|
num_seqs=args.batch_size,
|
||||||
context_len=args.context_len,
|
seq_len=args.seq_len,
|
||||||
num_query_heads=args.num_query_heads,
|
num_query_heads=args.num_query_heads,
|
||||||
num_kv_heads=args.num_kv_heads,
|
num_kv_heads=args.num_kv_heads,
|
||||||
head_size=args.head_size,
|
head_size=args.head_size,
|
||||||
block_size=args.block_size,
|
block_size=args.block_size,
|
||||||
use_alibi=args.use_alibi,
|
use_alibi=args.use_alibi,
|
||||||
dtype=dtype_to_torch_dtype[args.dtype],
|
dtype=STR_DTYPE_TO_TORCH_DTYPE[args.dtype],
|
||||||
seed=args.seed,
|
seed=args.seed,
|
||||||
do_profile=args.profile,
|
do_profile=args.profile,
|
||||||
|
kv_cache_dtype=args.kv_cache_dtype,
|
||||||
)
|
)
|
||||||
|
benchmarks/kernels/benchmark_rope.py (new file, +121 lines)
@@ -0,0 +1,121 @@
import argparse
from itertools import accumulate
from typing import Optional

import nvtx
import torch

from vllm.model_executor.layers.rotary_embedding import get_rope


def benchmark_rope_kernels_multi_lora(
    is_neox_style: bool,
    batch_size: int,
    seq_len: int,
    num_heads: int,
    head_size: int,
    rotary_dim: Optional[int],
    dtype: torch.dtype,
    seed: int,
    device: str,
    max_position: int = 8192,
    base: int = 10000,
) -> None:
    torch.random.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
    torch.set_default_device(device)
    if rotary_dim is None:
        rotary_dim = head_size
    # simulating serving 4 LoRAs
    scaling_factors = [1, 2, 4, 8]
    # batched RoPE can take multiple scaling factors
    batched_rope = get_rope(head_size, rotary_dim, max_position, base,
                            is_neox_style, {
                                "type": "linear",
                                "factor": tuple(scaling_factors)
                            })
    # non-batched RoPE takes only one scaling factor, we create multiple
    # instances to simulate the same behavior
    non_batched_ropes = []
    for scaling_factor in scaling_factors:
        non_batched_ropes.append(
            get_rope(head_size, rotary_dim, max_position, base, is_neox_style,
                     {
                         "type": "linear",
                         "factor": (scaling_factor, )
                     }))

    positions = torch.randint(0, max_position, (batch_size, seq_len))
    query = torch.randn(batch_size,
                        seq_len,
                        num_heads * head_size,
                        dtype=dtype)
    key = torch.randn_like(query)

    # create query offsets for batched RoPE, we concat multiple kv cache
    # together and each query needs to find the right kv cache of its type
    offset_map = torch.tensor(
        list(
            accumulate([0] + [
                max_position * scaling_factor * 2
                for scaling_factor in scaling_factors[:-1]
            ])))
    query_types = torch.randint(0,
                                len(scaling_factors), (batch_size, seq_len),
                                device=device)
    # map query types to offsets
    query_offsets = offset_map[query_types]
    # the kernel takes flattened offsets
    flatten_offsets = query_offsets.flatten()

    # batched queries of the same type together for non-batched RoPE
    queries = [query[query_types == i] for i in range(len(scaling_factors))]
    keys = [key[query_types == i] for i in range(len(scaling_factors))]
    packed_qkr = zip(queries, keys, non_batched_ropes)
    # synchronize before start timing
    torch.cuda.synchronize()
    with nvtx.annotate("non-batched", color="yellow"):
        for q, k, r in packed_qkr:
            r.forward(positions, q, k)
    torch.cuda.synchronize()
    with nvtx.annotate("batched", color="green"):
        batched_rope.forward(positions, query, key, flatten_offsets)
    torch.cuda.synchronize()


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description="Benchmark the rotary embedding kernels.")
    parser.add_argument("--is-neox-style", type=bool, default=True)
    parser.add_argument("--batch-size", type=int, default=16)
    parser.add_argument("--seq-len", type=int, default=512)
    parser.add_argument("--num-heads", type=int, default=8)
    parser.add_argument("--head-size",
                        type=int,
                        choices=[64, 80, 96, 112, 128, 192, 256],
                        default=128)
    parser.add_argument("--rotary-dim", type=int, choices=[16, 32], default=32)
    parser.add_argument("--dtype",
                        type=str,
                        choices=["bfloat16", "float"],
                        default="float")
    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument("--device",
                        type=str,
                        choices=["cuda:0", "cuda:1"],
                        default="cuda:0")
    args = parser.parse_args()
    print(args)

    benchmark_rope_kernels_multi_lora(
        is_neox_style=args.is_neox_style,
        batch_size=args.batch_size,
        seq_len=args.seq_len,
        num_heads=args.num_heads,
        head_size=args.head_size,
        rotary_dim=args.rotary_dim,
        dtype=getattr(torch, args.dtype),
        seed=args.seed,
        device=args.device,
    )
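Because this script only marks the two code paths with nvtx ranges and prints no timings itself, it is meant to be inspected under a GPU profiler; for example, a command along the lines of "nsys profile -t cuda,nvtx -o rope_bench python benchmarks/kernels/benchmark_rope.py --batch-size 16 --seq-len 512" (the flags here are illustrative) shows the "batched" and "non-batched" ranges side by side in the timeline.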
benchmarks/kernels/benchmark_shapes.py (new file, +75 lines)
@@ -0,0 +1,75 @@
|
|||||||
|
WEIGHT_SHAPES = {
|
||||||
|
"ideal": [[4 * 256 * 32, 256 * 32]],
|
||||||
|
"mistralai/Mistral-7B-v0.1/TP1": [
|
||||||
|
[4096, 6144],
|
||||||
|
[4096, 4096],
|
||||||
|
[4096, 28672],
|
||||||
|
[14336, 4096],
|
||||||
|
],
|
||||||
|
"mistralai/Mistral-7B-v0.1/TP2": [
|
||||||
|
[4096, 3072],
|
||||||
|
[2048, 4096],
|
||||||
|
[4096, 14336],
|
||||||
|
[7168, 4096],
|
||||||
|
],
|
||||||
|
"mistralai/Mistral-7B-v0.1/TP4": [
|
||||||
|
[4096, 1536],
|
||||||
|
[1024, 4096],
|
||||||
|
[4096, 7168],
|
||||||
|
[3584, 4096],
|
||||||
|
],
|
||||||
|
"meta-llama/Llama-2-7b-hf/TP1": [
|
||||||
|
[4096, 12288],
|
||||||
|
[4096, 4096],
|
||||||
|
[4096, 22016],
|
||||||
|
[11008, 4096],
|
||||||
|
],
|
||||||
|
"meta-llama/Llama-2-7b-hf/TP2": [
|
||||||
|
[4096, 6144],
|
||||||
|
[2048, 4096],
|
||||||
|
[4096, 11008],
|
||||||
|
[5504, 4096],
|
||||||
|
],
|
||||||
|
"meta-llama/Llama-2-7b-hf/TP4": [
|
||||||
|
[4096, 3072],
|
||||||
|
[1024, 4096],
|
||||||
|
[4096, 5504],
|
||||||
|
[2752, 4096],
|
||||||
|
],
|
||||||
|
"meta-llama/Llama-2-13b-hf/TP1": [
|
||||||
|
[5120, 15360],
|
||||||
|
[5120, 5120],
|
||||||
|
[5120, 27648],
|
||||||
|
[13824, 5120],
|
||||||
|
],
|
||||||
|
"meta-llama/Llama-2-13b-hf/TP2": [
|
||||||
|
[5120, 7680],
|
||||||
|
[2560, 5120],
|
||||||
|
[5120, 13824],
|
||||||
|
[6912, 5120],
|
||||||
|
],
|
||||||
|
"meta-llama/Llama-2-13b-hf/TP4": [
|
||||||
|
[5120, 3840],
|
||||||
|
[1280, 5120],
|
||||||
|
[5120, 6912],
|
||||||
|
[3456, 5120],
|
||||||
|
],
|
||||||
|
"meta-llama/Llama-2-70b-hf/TP1": [
|
||||||
|
[8192, 10240],
|
||||||
|
[8192, 8192],
|
||||||
|
[8192, 57344],
|
||||||
|
[28672, 8192],
|
||||||
|
],
|
||||||
|
"meta-llama/Llama-2-70b-hf/TP2": [
|
||||||
|
[8192, 5120],
|
||||||
|
[4096, 8192],
|
||||||
|
[8192, 28672],
|
||||||
|
[14336, 8192],
|
||||||
|
],
|
||||||
|
"meta-llama/Llama-2-70b-hf/TP4": [
|
||||||
|
[8192, 2560],
|
||||||
|
[2048, 8192],
|
||||||
|
[8192, 14336],
|
||||||
|
[7168, 8192],
|
||||||
|
],
|
||||||
|
}
|
||||||
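The table above appears to list [K, N] GEMM shapes for the linear layers of each model at a given tensor-parallel degree (for example, [4096, 28672] looks like the merged gate/up projection of Mistral-7B at TP1). A hypothetical consumer might iterate it as sketched below; the import path is an assumption based on the file location, and the [K, N] reading is inferred rather than stated in the diff.

# Hypothetical usage sketch; import path and [K, N] interpretation are assumptions.
from benchmarks.kernels.benchmark_shapes import WEIGHT_SHAPES

for model_tp, shapes in WEIGHT_SHAPES.items():
    for k, n in shapes:
        print(f"{model_tp}: GEMM with K={k}, N={n}")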
@@ -4,9 +4,9 @@ PORT=8000
 MODEL=$1
 TOKENS=$2

-docker run --gpus all --shm-size 1g -p $PORT:80 \
+docker run -e HF_TOKEN=$HF_TOKEN --gpus all --shm-size 1g -p $PORT:80 \
     -v $PWD/data:/data \
-    ghcr.io/huggingface/text-generation-inference:0.8 \
+    ghcr.io/huggingface/text-generation-inference:1.4.0 \
     --model-id $MODEL \
     --sharded false \
     --max-input-length 1024 \
benchmarks/overheads/benchmark_hashing.py (new file, 63 lines)
@@ -0,0 +1,63 @@
import argparse
import cProfile
import pstats

from vllm import LLM, SamplingParams

# A very long prompt, total number of tokens is about 15k.
LONG_PROMPT = ["You are an expert in large language models, aren't you?"
               ] * 1000
LONG_PROMPT = ' '.join(LONG_PROMPT)


def main(args):
    llm = LLM(
        model=args.model,
        enforce_eager=True,
        enable_prefix_caching=True,
        tensor_parallel_size=args.tensor_parallel_size,
        use_v2_block_manager=args.use_v2_block_manager,
    )

    sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len)
    profiler = cProfile.Profile()

    print("------warm up------")
    for i in range(3):
        output = llm.generate(LONG_PROMPT, sampling_params)
        print(output[0].outputs[0].text)

    print("------start generating------")
    for i in range(3):
        profiler.runctx('llm.generate(LONG_PROMPT, sampling_params)',
                        globals(), locals())

    # analyze the runtime of hashing function
    stats = pstats.Stats(profiler)
    stats.sort_stats('cumulative')
    total_time = 0
    total_calls = 0
    for func in stats.stats:
        if 'hash_of_block' in func[2]:
            total_time = stats.stats[func][3]
            total_calls = stats.stats[func][0]
    percentage = (total_time / stats.total_tt) * 100
    print(f"Hashing took {total_time:.2f} seconds, "
          f"{percentage:.2f}% of the total runtime.")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description='Benchmark the performance of hashing function in '
        'automatic prefix caching.')
    parser.add_argument('--model', type=str, default='lmsys/longchat-7b-16k')
    parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1)
    parser.add_argument('--output-len', type=int, default=10)
    parser.add_argument('--enable-prefix-caching',
                        action='store_true',
                        help='enable prefix caching')
    parser.add_argument('--use-v2-block-manager',
                        action='store_true',
                        help='Use BlockSpaceManagerV2')
    args = parser.parse_args()
    main(args)
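The profiling loop above relies on the layout of pstats bookkeeping: keys are (filename, lineno, func_name) tuples and values are (primitive_calls, total_calls, tottime, cumtime, callers), which is why the script reads func[2] for the name and stats.stats[func][3] for cumulative time. The self-contained sketch below demonstrates that layout with a stand-in hash_of_block (not vLLM's helper).

# Sketch only; hash_of_block here is a placeholder, not the vLLM function.
import cProfile
import pstats


def hash_of_block(x):
    return hash((x, x + 1))


profiler = cProfile.Profile()
profiler.runctx('sum(hash_of_block(i) for i in range(10000))',
                globals(), locals())
stats = pstats.Stats(profiler)
for func, (cc, nc, tt, ct, callers) in stats.stats.items():
    if 'hash_of_block' in func[2]:
        print(f"{func[2]}: {nc} calls, cumulative time {ct:.4f}s")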
benchmarks/sonnet.txt (new file, 518 lines)
@@ -0,0 +1,518 @@
FROM fairest creatures we desire increase,
That thereby beauty's rose might never die,
But as the riper should by time decease,
His tender heir might bear his memory:
But thou, contracted to thine own bright eyes,
Feed'st thy light'st flame with self-substantial fuel,
Making a famine where abundance lies,
Thyself thy foe, to thy sweet self too cruel.
Thou that art now the world's fresh ornament
And only herald to the gaudy spring,
Within thine own bud buriest thy content
And, tender churl, makest waste in niggarding.
Pity the world, or else this glutton be,
To eat the world's due, by the grave and thee.
[The remaining lines of this new file continue Shakespeare's sonnets verbatim; the text serves purely as plain-text prompt data for the benchmark scripts.]
cmake/cpu_extension.cmake (new file, 90 lines)
@@ -0,0 +1,90 @@
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)

#
# Define environment variables for special configurations
#
if(DEFINED ENV{VLLM_CPU_AVX512BF16})
  set(ENABLE_AVX512BF16 ON)
endif()

include_directories("${CMAKE_SOURCE_DIR}/csrc")

#
# Check the compile flags
#
list(APPEND CXX_COMPILE_FLAGS
  "-fopenmp"
  "-DVLLM_CPU_EXTENSION")

execute_process(COMMAND cat /proc/cpuinfo
                RESULT_VARIABLE CPUINFO_RET
                OUTPUT_VARIABLE CPUINFO)

if (NOT CPUINFO_RET EQUAL 0)
  message(FATAL_ERROR "Failed to check CPU features via /proc/cpuinfo")
endif()

function (find_isa CPUINFO TARGET OUT)
  string(FIND ${CPUINFO} ${TARGET} ISA_FOUND)
  if(NOT ISA_FOUND EQUAL -1)
    set(${OUT} ON PARENT_SCOPE)
  else()
    set(${OUT} OFF PARENT_SCOPE)
  endif()
endfunction()

find_isa(${CPUINFO} "avx512f" AVX512_FOUND)

if (AVX512_FOUND)
  list(APPEND CXX_COMPILE_FLAGS
    "-mavx512f"
    "-mavx512vl"
    "-mavx512bw"
    "-mavx512dq")

  find_isa(${CPUINFO} "avx512_bf16" AVX512BF16_FOUND)
  if (AVX512BF16_FOUND OR ENABLE_AVX512BF16)
    if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND
        CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 12.3)
      list(APPEND CXX_COMPILE_FLAGS "-mavx512bf16")
    else()
      message(WARNING "Disable AVX512-BF16 ISA support, requires gcc/g++ >= 12.3")
    endif()
  else()
    message(WARNING "Disable AVX512-BF16 ISA support, no avx512_bf16 found in local CPU flags." " If cross-compilation is required, please set env VLLM_CPU_AVX512BF16=1.")
  endif()
else()
  message(FATAL_ERROR "vLLM CPU backend requires AVX512 ISA support.")
endif()

message(STATUS "CPU extension compile flags: ${CXX_COMPILE_FLAGS}")


#
# Define extension targets
#

#
# _C extension
#
set(VLLM_EXT_SRC
  "csrc/cpu/activation.cpp"
  "csrc/cpu/attention.cpp"
  "csrc/cpu/cache.cpp"
  "csrc/cpu/layernorm.cpp"
  "csrc/cpu/pos_encoding.cpp"
  "csrc/cpu/torch_bindings.cpp")

define_gpu_extension_target(
  _C
  DESTINATION vllm
  LANGUAGE CXX
  SOURCES ${VLLM_EXT_SRC}
  COMPILE_FLAGS ${CXX_COMPILE_FLAGS}
  USE_SABI 3
  WITH_SOABI
)

add_custom_target(default)
message(STATUS "Enabling C extension.")
add_dependencies(default _C)
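The ISA probe above is just a substring search over /proc/cpuinfo. A rough Python equivalent of find_isa (a sketch assuming a Linux host, not part of the build) is:

# Sketch only; mirrors the cmake find_isa() substring check, assuming Linux.
def find_isa(cpuinfo: str, target: str) -> bool:
    return target in cpuinfo


with open("/proc/cpuinfo") as f:
    cpuinfo = f.read()

print("avx512f     :", find_isa(cpuinfo, "avx512f"))
print("avx512_bf16 :", find_isa(cpuinfo, "avx512_bf16"))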
cmake/hipify.py (new executable file, 73 lines)
@@ -0,0 +1,73 @@
#!/usr/bin/env python3

#
# A command line tool for running pytorch's hipify preprocessor on CUDA
# source files.
#
# See https://github.com/ROCm/hipify_torch
# and <torch install dir>/utils/hipify/hipify_python.py
#

import argparse
import os
import shutil

from torch.utils.hipify.hipify_python import hipify

if __name__ == '__main__':
    parser = argparse.ArgumentParser()

    # Project directory where all the source + include files live.
    parser.add_argument(
        "-p",
        "--project_dir",
        help="The project directory.",
    )

    # Directory where hipified files are written.
    parser.add_argument(
        "-o",
        "--output_dir",
        help="The output directory.",
    )

    # Source files to convert.
    parser.add_argument("sources",
                        help="Source files to hipify.",
                        nargs="*",
                        default=[])

    args = parser.parse_args()

    # Limit include scope to project_dir only
    includes = [os.path.join(args.project_dir, '*')]

    # Get absolute path for all source files.
    extra_files = [os.path.abspath(s) for s in args.sources]

    # Copy sources from project directory to output directory.
    # The directory might already exist to hold object files so we ignore that.
    shutil.copytree(args.project_dir, args.output_dir, dirs_exist_ok=True)

    hipify_result = hipify(project_directory=args.project_dir,
                           output_directory=args.output_dir,
                           header_include_dirs=[],
                           includes=includes,
                           extra_files=extra_files,
                           show_detailed=True,
                           is_pytorch_extension=True,
                           hipify_extra_files_only=True)

    hipified_sources = []
    for source in args.sources:
        s_abs = os.path.abspath(source)
        hipified_s_abs = (hipify_result[s_abs].hipified_path if
                          (s_abs in hipify_result
                           and hipify_result[s_abs].hipified_path is not None)
                          else s_abs)
        hipified_sources.append(hipified_s_abs)

    assert (len(hipified_sources) == len(args.sources))

    # Print hipified source files.
    print("\n".join(hipified_sources))
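A hypothetical invocation of the tool above from a build step might look like the sketch below. The source file name is a placeholder; only the -p, -o, and positional-source arguments come from the argparse definition in the diff, and the command prints the hipified paths on stdout.

# Hypothetical invocation sketch; paths and source name are placeholders.
import subprocess

result = subprocess.run(
    ["python3", "cmake/hipify.py",
     "-p", "csrc",                 # project dir holding the CUDA sources
     "-o", "build/csrc",           # where hipified copies are written
     "csrc/example_kernels.cu"],   # sources to convert (placeholder)
    capture_output=True, text=True, check=True)
print(result.stdout)               # newline-separated hipified file paths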
cmake/utils.cmake (new file, 359 lines)
@@ -0,0 +1,359 @@
#
# Attempt to find the python package that uses the same python executable as
# `EXECUTABLE` and is one of the `SUPPORTED_VERSIONS`.
#
macro (find_python_from_executable EXECUTABLE SUPPORTED_VERSIONS)
  file(REAL_PATH ${EXECUTABLE} EXECUTABLE)
  set(Python_EXECUTABLE ${EXECUTABLE})
  find_package(Python COMPONENTS Interpreter Development.Module Development.SABIModule)
  if (NOT Python_FOUND)
    message(FATAL_ERROR "Unable to find python matching: ${EXECUTABLE}.")
  endif()
  set(_VER "${Python_VERSION_MAJOR}.${Python_VERSION_MINOR}")
  set(_SUPPORTED_VERSIONS_LIST ${SUPPORTED_VERSIONS} ${ARGN})
  if (NOT _VER IN_LIST _SUPPORTED_VERSIONS_LIST)
    message(FATAL_ERROR
      "Python version (${_VER}) is not one of the supported versions: "
      "${_SUPPORTED_VERSIONS_LIST}.")
  endif()
  message(STATUS "Found python matching: ${EXECUTABLE}.")
endmacro()

#
# Run `EXPR` in python. The standard output of python is stored in `OUT` and
# has trailing whitespace stripped. If an error is encountered when running
# python, a fatal message `ERR_MSG` is issued.
#
function (run_python OUT EXPR ERR_MSG)
  execute_process(
    COMMAND
    "${Python_EXECUTABLE}" "-c" "${EXPR}"
    OUTPUT_VARIABLE PYTHON_OUT
    RESULT_VARIABLE PYTHON_ERROR_CODE
    ERROR_VARIABLE PYTHON_STDERR
    OUTPUT_STRIP_TRAILING_WHITESPACE)

  if(NOT PYTHON_ERROR_CODE EQUAL 0)
    message(FATAL_ERROR "${ERR_MSG}: ${PYTHON_STDERR}")
  endif()
  set(${OUT} ${PYTHON_OUT} PARENT_SCOPE)
endfunction()

# Run `EXPR` in python after importing `PKG`. Use the result of this to extend
# `CMAKE_PREFIX_PATH` so the torch cmake configuration can be imported.
macro (append_cmake_prefix_path PKG EXPR)
  run_python(_PREFIX_PATH
    "import ${PKG}; print(${EXPR})" "Failed to locate ${PKG} path")
  list(APPEND CMAKE_PREFIX_PATH ${_PREFIX_PATH})
endmacro()

#
# Add a target named `hipify${NAME}` that runs the hipify preprocessor on a set
# of CUDA source files. The names of the corresponding "hipified" sources are
# stored in `OUT_SRCS`.
#
function (hipify_sources_target OUT_SRCS NAME ORIG_SRCS)
  #
  # Split into C++ and non-C++ (i.e. CUDA) sources.
  #
  set(SRCS ${ORIG_SRCS})
  set(CXX_SRCS ${ORIG_SRCS})
  list(FILTER SRCS EXCLUDE REGEX "\.(cc)|(cpp)$")
  list(FILTER CXX_SRCS INCLUDE REGEX "\.(cc)|(cpp)$")

  #
  # Generate ROCm/HIP source file names from CUDA file names.
  # Since HIP files are generated code, they will appear in the build area
  # `CMAKE_CURRENT_BINARY_DIR` directory rather than the original csrc dir.
  #
  set(HIP_SRCS)
  foreach (SRC ${SRCS})
    string(REGEX REPLACE "\.cu$" "\.hip" SRC ${SRC})
    string(REGEX REPLACE "cuda" "hip" SRC ${SRC})
    list(APPEND HIP_SRCS "${CMAKE_CURRENT_BINARY_DIR}/${SRC}")
  endforeach()

  set(CSRC_BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}/csrc)
  add_custom_target(
    hipify${NAME}
    COMMAND ${CMAKE_SOURCE_DIR}/cmake/hipify.py -p ${CMAKE_SOURCE_DIR}/csrc -o ${CSRC_BUILD_DIR} ${SRCS}
    DEPENDS ${CMAKE_SOURCE_DIR}/cmake/hipify.py ${SRCS}
    BYPRODUCTS ${HIP_SRCS}
    COMMENT "Running hipify on ${NAME} extension source files.")

  # Swap out original extension sources with hipified sources.
  list(APPEND HIP_SRCS ${CXX_SRCS})
  set(${OUT_SRCS} ${HIP_SRCS} PARENT_SCOPE)
endfunction()

#
# Get additional GPU compiler flags from torch.
#
function (get_torch_gpu_compiler_flags OUT_GPU_FLAGS GPU_LANG)
  if (${GPU_LANG} STREQUAL "CUDA")
    #
    # Get common NVCC flags from torch.
    #
    run_python(GPU_FLAGS
      "from torch.utils.cpp_extension import COMMON_NVCC_FLAGS; print(';'.join(COMMON_NVCC_FLAGS))"
      "Failed to determine torch nvcc compiler flags")

    if (CUDA_VERSION VERSION_GREATER_EQUAL 11.8)
      list(APPEND GPU_FLAGS "-DENABLE_FP8")
    endif()
    if (CUDA_VERSION VERSION_GREATER_EQUAL 12.0)
      list(REMOVE_ITEM GPU_FLAGS
        "-D__CUDA_NO_HALF_OPERATORS__"
        "-D__CUDA_NO_HALF_CONVERSIONS__"
        "-D__CUDA_NO_BFLOAT16_CONVERSIONS__"
        "-D__CUDA_NO_HALF2_OPERATORS__")
    endif()

  elseif(${GPU_LANG} STREQUAL "HIP")
    #
    # Get common HIP/HIPCC flags from torch.
    #
    run_python(GPU_FLAGS
      "import torch.utils.cpp_extension as t; print(';'.join(t.COMMON_HIP_FLAGS + t.COMMON_HIPCC_FLAGS))"
      "Failed to determine torch nvcc compiler flags")

    list(APPEND GPU_FLAGS
      "-DUSE_ROCM"
      "-DENABLE_FP8"
      "-U__HIP_NO_HALF_CONVERSIONS__"
      "-U__HIP_NO_HALF_OPERATORS__"
      "-fno-gpu-rdc")

  endif()
  set(${OUT_GPU_FLAGS} ${GPU_FLAGS} PARENT_SCOPE)
endfunction()

# Macro for converting a `gencode` version number to a cmake version number.
macro(string_to_ver OUT_VER IN_STR)
  string(REGEX REPLACE "\([0-9]+\)\([0-9]\)" "\\1.\\2" ${OUT_VER} ${IN_STR})
endmacro()

#
# Override the GPU architectures detected by cmake/torch and filter them by
# `GPU_SUPPORTED_ARCHES`. Sets the final set of architectures in
# `GPU_ARCHES`.
#
# Note: this is defined as a macro since it updates `CMAKE_CUDA_FLAGS`.
#
macro(override_gpu_arches GPU_ARCHES GPU_LANG GPU_SUPPORTED_ARCHES)
  set(_GPU_SUPPORTED_ARCHES_LIST ${GPU_SUPPORTED_ARCHES} ${ARGN})
  message(STATUS "${GPU_LANG} supported arches: ${_GPU_SUPPORTED_ARCHES_LIST}")

  if (${GPU_LANG} STREQUAL "HIP")
    #
    # `GPU_ARCHES` controls the `--offload-arch` flags.
    # `CMAKE_HIP_ARCHITECTURES` is set up by torch and can be controlled
    # via the `PYTORCH_ROCM_ARCH` env variable.
    #

    #
    # Find the intersection of the supported + detected architectures to
    # set the module architecture flags.
    #
    set(${GPU_ARCHES})
    foreach (_ARCH ${CMAKE_HIP_ARCHITECTURES})
      if (_ARCH IN_LIST _GPU_SUPPORTED_ARCHES_LIST)
        list(APPEND ${GPU_ARCHES} ${_ARCH})
      endif()
    endforeach()

    if(NOT ${GPU_ARCHES})
      message(FATAL_ERROR
        "None of the detected ROCm architectures: ${CMAKE_HIP_ARCHITECTURES} is"
        " supported. Supported ROCm architectures are: ${_GPU_SUPPORTED_ARCHES_LIST}.")
    endif()

  elseif(${GPU_LANG} STREQUAL "CUDA")
    #
    # Setup/process CUDA arch flags.
    #
    # The torch cmake setup hardcodes the detected architecture flags in
    # `CMAKE_CUDA_FLAGS`. Since `CMAKE_CUDA_FLAGS` is a "global" variable, it
    # can't modified on a per-target basis, e.g. for the `punica` extension.
    # So, all the `-gencode` flags need to be extracted and removed from
    # `CMAKE_CUDA_FLAGS` for processing so they can be passed by another method.
    # Since it's not possible to use `target_compiler_options` for adding target
    # specific `-gencode` arguments, the target's `CUDA_ARCHITECTURES` property
    # must be used instead. This requires repackaging the architecture flags
    # into a format that cmake expects for `CUDA_ARCHITECTURES`.
    #
    # This is a bit fragile in that it depends on torch using `-gencode` as opposed
    # to one of the other nvcc options to specify architectures.
    #
    # Note: torch uses the `TORCH_CUDA_ARCH_LIST` environment variable to override
    # detected architectures.
    #
    message(DEBUG "initial CMAKE_CUDA_FLAGS: ${CMAKE_CUDA_FLAGS}")

    # Extract all `-gencode` flags from `CMAKE_CUDA_FLAGS`
    string(REGEX MATCHALL "-gencode arch=[^ ]+" _CUDA_ARCH_FLAGS
      ${CMAKE_CUDA_FLAGS})

    # Remove all `-gencode` flags from `CMAKE_CUDA_FLAGS` since they will be modified
    # and passed back via the `CUDA_ARCHITECTURES` property.
    string(REGEX REPLACE "-gencode arch=[^ ]+ *" "" CMAKE_CUDA_FLAGS
      ${CMAKE_CUDA_FLAGS})

    # If this error is triggered, it might mean that torch has changed how it sets
    # up nvcc architecture code generation flags.
    if (NOT _CUDA_ARCH_FLAGS)
      message(FATAL_ERROR
        "Could not find any architecture related code generation flags in "
        "CMAKE_CUDA_FLAGS. (${CMAKE_CUDA_FLAGS})")
    endif()

    message(DEBUG "final CMAKE_CUDA_FLAGS: ${CMAKE_CUDA_FLAGS}")
    message(DEBUG "arch flags: ${_CUDA_ARCH_FLAGS}")

    # Initialize the architecture lists to empty.
    set(${GPU_ARCHES})

    # Process each `gencode` flag.
    foreach(_ARCH ${_CUDA_ARCH_FLAGS})
      # For each flag, extract the version number and whether it refers to PTX
      # or native code.
      # Note: if a regex matches then `CMAKE_MATCH_1` holds the binding
      # for that match.

      string(REGEX MATCH "arch=compute_\([0-9]+a?\)" _COMPUTE ${_ARCH})
      if (_COMPUTE)
        set(_COMPUTE ${CMAKE_MATCH_1})
      endif()

      string(REGEX MATCH "code=sm_\([0-9]+a?\)" _SM ${_ARCH})
      if (_SM)
        set(_SM ${CMAKE_MATCH_1})
      endif()

      string(REGEX MATCH "code=compute_\([0-9]+a?\)" _CODE ${_ARCH})
      if (_CODE)
        set(_CODE ${CMAKE_MATCH_1})
      endif()

      # Make sure the virtual architecture can be matched.
      if (NOT _COMPUTE)
        message(FATAL_ERROR
          "Could not determine virtual architecture from: ${_ARCH}.")
      endif()

      # One of sm_ or compute_ must exist.
      if ((NOT _SM) AND (NOT _CODE))
        message(FATAL_ERROR
          "Could not determine a codegen architecture from: ${_ARCH}.")
      endif()

      if (_SM)
        # -real suffix let CMake to only generate elf code for the kernels.
        # we want this, otherwise the added ptx (default) will increase binary size.
        set(_VIRT "-real")
        set(_CODE_ARCH ${_SM})
      else()
        # -virtual suffix let CMake to generate ptx code for the kernels.
        set(_VIRT "-virtual")
        set(_CODE_ARCH ${_CODE})
      endif()

      # Check if the current version is in the supported arch list.
      string_to_ver(_CODE_VER ${_CODE_ARCH})
      if (NOT _CODE_VER IN_LIST _GPU_SUPPORTED_ARCHES_LIST)
        message(STATUS "discarding unsupported CUDA arch ${_VER}.")
        continue()
      endif()

      # Add it to the arch list.
      list(APPEND ${GPU_ARCHES} "${_CODE_ARCH}${_VIRT}")
    endforeach()
  endif()
  message(STATUS "${GPU_LANG} target arches: ${${GPU_ARCHES}}")
endmacro()

#
# Define a target named `GPU_MOD_NAME` for a single extension. The
# arguments are:
#
# DESTINATION <dest>         - Module destination directory.
# LANGUAGE <lang>            - The GPU language for this module, e.g CUDA, HIP,
#                              etc.
# SOURCES <sources>          - List of source files relative to CMakeLists.txt
#                              directory.
#
# Optional arguments:
#
# ARCHITECTURES <arches>     - A list of target GPU architectures in cmake
#                              format.
#                              Refer `CMAKE_CUDA_ARCHITECTURES` documentation
#                              and `CMAKE_HIP_ARCHITECTURES` for more info.
#                              ARCHITECTURES will use cmake's defaults if
#                              not provided.
# COMPILE_FLAGS <flags>      - Extra compiler flags passed to NVCC/hip.
# INCLUDE_DIRECTORIES <dirs> - Extra include directories.
# LIBRARIES <libraries>      - Extra link libraries.
# WITH_SOABI                 - Generate library with python SOABI suffix name.
# USE_SABI <version>         - Use python stable api <version>
#
# Note: optimization level/debug info is set via cmake build type.
#
function (define_gpu_extension_target GPU_MOD_NAME)
  cmake_parse_arguments(PARSE_ARGV 1
    GPU
    "WITH_SOABI"
    "DESTINATION;LANGUAGE;USE_SABI"
    "SOURCES;ARCHITECTURES;COMPILE_FLAGS;INCLUDE_DIRECTORIES;LIBRARIES")

  # Add hipify preprocessing step when building with HIP/ROCm.
  if (GPU_LANGUAGE STREQUAL "HIP")
    hipify_sources_target(GPU_SOURCES ${GPU_MOD_NAME} "${GPU_SOURCES}")
  endif()

  if (GPU_WITH_SOABI)
    set(GPU_WITH_SOABI WITH_SOABI)
  else()
    set(GPU_WITH_SOABI)
  endif()

  if (GPU_USE_SABI)
    Python_add_library(${GPU_MOD_NAME} MODULE USE_SABI ${GPU_USE_SABI} ${GPU_WITH_SOABI} "${GPU_SOURCES}")
  else()
    Python_add_library(${GPU_MOD_NAME} MODULE ${GPU_WITH_SOABI} "${GPU_SOURCES}")
  endif()

  if (GPU_LANGUAGE STREQUAL "HIP")
    # Make this target dependent on the hipify preprocessor step.
    add_dependencies(${GPU_MOD_NAME} hipify${GPU_MOD_NAME})
  endif()

  if (GPU_ARCHITECTURES)
    set_target_properties(${GPU_MOD_NAME} PROPERTIES
      ${GPU_LANGUAGE}_ARCHITECTURES "${GPU_ARCHITECTURES}")
  endif()

  set_property(TARGET ${GPU_MOD_NAME} PROPERTY CXX_STANDARD 17)

  target_compile_options(${GPU_MOD_NAME} PRIVATE
    $<$<COMPILE_LANGUAGE:${GPU_LANGUAGE}>:${GPU_COMPILE_FLAGS}>)

  target_compile_definitions(${GPU_MOD_NAME} PRIVATE
    "-DTORCH_EXTENSION_NAME=${GPU_MOD_NAME}")

  target_include_directories(${GPU_MOD_NAME} PRIVATE csrc
    ${GPU_INCLUDE_DIRECTORIES})

  target_link_libraries(${GPU_MOD_NAME} PRIVATE torch ${torch_python_LIBRARY}
    ${GPU_LIBRARIES})

  # Don't use `TORCH_LIBRARIES` for CUDA since it pulls in a bunch of
  # dependencies that are not necessary and may not be installed.
  if (GPU_LANGUAGE STREQUAL "CUDA")
    target_link_libraries(${GPU_MOD_NAME} PRIVATE ${CUDA_CUDA_LIB}
      ${CUDA_LIBRARIES})
  else()
    target_link_libraries(${GPU_MOD_NAME} PRIVATE ${TORCH_LIBRARIES})
  endif()

  install(TARGETS ${GPU_MOD_NAME} LIBRARY DESTINATION ${GPU_DESTINATION})
endfunction()
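To make the -gencode repackaging in override_gpu_arches concrete, here is a hypothetical Python rendering of the same regex logic; it only illustrates the mapping from torch's nvcc flags to CUDA_ARCHITECTURES entries such as "80-real" and "90-virtual", and is not code from the repository.

# Illustration only; mirrors the regex handling in override_gpu_arches above.
import re


def gencode_to_cmake_arch(flag: str) -> str:
    compute = re.search(r"arch=compute_([0-9]+a?)", flag)
    sm = re.search(r"code=sm_([0-9]+a?)", flag)
    code = re.search(r"code=compute_([0-9]+a?)", flag)
    if compute is None or (sm is None and code is None):
        raise ValueError(f"cannot parse {flag!r}")
    # code=sm_XX emits device (elf) code only; code=compute_XX emits PTX only.
    return f"{sm.group(1)}-real" if sm else f"{code.group(1)}-virtual"


print(gencode_to_cmake_arch("-gencode arch=compute_80,code=sm_80"))       # 80-real
print(gencode_to_cmake_arch("-gencode arch=compute_90,code=compute_90"))  # 90-virtual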
728
collect_env.py
Normal file
728
collect_env.py
Normal file
@ -0,0 +1,728 @@
|
|||||||
|
# ruff: noqa
|
||||||
|
# code borrowed from https://github.com/pytorch/pytorch/blob/main/torch/utils/collect_env.py
|
||||||
|
|
||||||
|
# Unlike the rest of the PyTorch this file must be python2 compliant.
|
||||||
|
# This script outputs relevant system environment info
|
||||||
|
# Run it with `python collect_env.py` or `python -m torch.utils.collect_env`
|
||||||
|
import datetime
|
||||||
|
import locale
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
from collections import namedtuple
|
||||||
|
|
||||||
|
try:
|
||||||
|
import torch
|
||||||
|
TORCH_AVAILABLE = True
|
||||||
|
except (ImportError, NameError, AttributeError, OSError):
|
||||||
|
TORCH_AVAILABLE = False
|
||||||
|
|
||||||
|
# System Environment Information
|
||||||
|
SystemEnv = namedtuple(
|
||||||
|
'SystemEnv',
|
||||||
|
[
|
||||||
|
'torch_version',
|
||||||
|
'is_debug_build',
|
||||||
|
'cuda_compiled_version',
|
||||||
|
'gcc_version',
|
||||||
|
'clang_version',
|
||||||
|
'cmake_version',
|
||||||
|
'os',
|
||||||
|
'libc_version',
|
||||||
|
'python_version',
|
||||||
|
'python_platform',
|
||||||
|
'is_cuda_available',
|
||||||
|
'cuda_runtime_version',
|
||||||
|
'cuda_module_loading',
|
||||||
|
'nvidia_driver_version',
|
||||||
|
'nvidia_gpu_models',
|
||||||
|
'cudnn_version',
|
||||||
|
'pip_version', # 'pip' or 'pip3'
|
||||||
|
'pip_packages',
|
||||||
|
'conda_packages',
|
||||||
|
'hip_compiled_version',
|
||||||
|
'hip_runtime_version',
|
||||||
|
'miopen_runtime_version',
|
||||||
|
'caching_allocator_config',
|
||||||
|
'is_xnnpack_available',
|
||||||
|
'cpu_info',
|
||||||
|
'rocm_version', # vllm specific field
|
||||||
|
'neuron_sdk_version', # vllm specific field
|
||||||
|
'vllm_version', # vllm specific field
|
||||||
|
'vllm_build_flags', # vllm specific field
|
||||||
|
'gpu_topo', # vllm specific field
|
||||||
|
])
|
||||||
|
|
||||||
|
DEFAULT_CONDA_PATTERNS = {
|
||||||
|
"torch",
|
||||||
|
"numpy",
|
||||||
|
"cudatoolkit",
|
||||||
|
"soumith",
|
||||||
|
"mkl",
|
||||||
|
"magma",
|
||||||
|
"triton",
|
||||||
|
"optree",
|
||||||
|
"nccl",
|
||||||
|
"transformers",
|
||||||
|
}
|
||||||
|
|
||||||
|
DEFAULT_PIP_PATTERNS = {
|
||||||
|
"torch",
|
||||||
|
"numpy",
|
||||||
|
"mypy",
|
||||||
|
"flake8",
|
||||||
|
"triton",
|
||||||
|
"optree",
|
||||||
|
"onnx",
|
||||||
|
"nccl",
|
||||||
|
"transformers",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def run(command):
    """Return (return-code, stdout, stderr)."""
    shell = True if type(command) is str else False
    p = subprocess.Popen(command,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE,
                         shell=shell)
    raw_output, raw_err = p.communicate()
    rc = p.returncode
    if get_platform() == 'win32':
        enc = 'oem'
    else:
        enc = locale.getpreferredencoding()
    output = raw_output.decode(enc)
    err = raw_err.decode(enc)
    return rc, output.strip(), err.strip()


def run_and_read_all(run_lambda, command):
    """Run command using run_lambda; reads and returns entire output if rc is 0."""
    rc, out, _ = run_lambda(command)
    if rc != 0:
        return None
    return out


def run_and_parse_first_match(run_lambda, command, regex):
    """Run command using run_lambda, returns the first regex match if it exists."""
    rc, out, _ = run_lambda(command)
    if rc != 0:
        return None
    match = re.search(regex, out)
    if match is None:
        return None
    return match.group(1)


def run_and_return_first_line(run_lambda, command):
    """Run command using run_lambda and returns first line if output is not empty."""
    rc, out, _ = run_lambda(command)
    if rc != 0:
        return None
    return out.split('\n')[0]


def get_conda_packages(run_lambda, patterns=None):
|
||||||
|
if patterns is None:
|
||||||
|
patterns = DEFAULT_CONDA_PATTERNS
|
||||||
|
conda = os.environ.get('CONDA_EXE', 'conda')
|
||||||
|
out = run_and_read_all(run_lambda, "{} list".format(conda))
|
||||||
|
if out is None:
|
||||||
|
return out
|
||||||
|
|
||||||
|
return "\n".join(line for line in out.splitlines()
|
||||||
|
if not line.startswith("#") and any(name in line
|
||||||
|
for name in patterns))
|
||||||
|
|
||||||
|
|
||||||
|
def get_gcc_version(run_lambda):
|
||||||
|
return run_and_parse_first_match(run_lambda, 'gcc --version', r'gcc (.*)')
|
||||||
|
|
||||||
|
|
||||||
|
def get_clang_version(run_lambda):
|
||||||
|
return run_and_parse_first_match(run_lambda, 'clang --version',
|
||||||
|
r'clang version (.*)')
|
||||||
|
|
||||||
|
|
||||||
|
def get_cmake_version(run_lambda):
|
||||||
|
return run_and_parse_first_match(run_lambda, 'cmake --version',
|
||||||
|
r'cmake (.*)')
|
||||||
|
|
||||||
|
|
||||||
|
def get_nvidia_driver_version(run_lambda):
|
||||||
|
if get_platform() == 'darwin':
|
||||||
|
cmd = 'kextstat | grep -i cuda'
|
||||||
|
return run_and_parse_first_match(run_lambda, cmd,
|
||||||
|
r'com[.]nvidia[.]CUDA [(](.*?)[)]')
|
||||||
|
smi = get_nvidia_smi()
|
||||||
|
return run_and_parse_first_match(run_lambda, smi,
|
||||||
|
r'Driver Version: (.*?) ')
|
||||||
|
|
||||||
|
|
||||||
|
def get_gpu_info(run_lambda):
|
||||||
|
if get_platform() == 'darwin' or (TORCH_AVAILABLE and hasattr(
|
||||||
|
torch.version, 'hip') and torch.version.hip is not None):
|
||||||
|
if TORCH_AVAILABLE and torch.cuda.is_available():
|
||||||
|
if torch.version.hip is not None:
|
||||||
|
prop = torch.cuda.get_device_properties(0)
|
||||||
|
if hasattr(prop, "gcnArchName"):
|
||||||
|
gcnArch = " ({})".format(prop.gcnArchName)
|
||||||
|
else:
|
||||||
|
gcnArch = "NoGCNArchNameOnOldPyTorch"
|
||||||
|
else:
|
||||||
|
gcnArch = ""
|
||||||
|
return torch.cuda.get_device_name(None) + gcnArch
|
||||||
|
return None
|
||||||
|
smi = get_nvidia_smi()
|
||||||
|
uuid_regex = re.compile(r' \(UUID: .+?\)')
|
||||||
|
rc, out, _ = run_lambda(smi + ' -L')
|
||||||
|
if rc != 0:
|
||||||
|
return None
|
||||||
|
# Anonymize GPUs by removing their UUID
|
||||||
|
return re.sub(uuid_regex, '', out)
|
||||||
|
|
||||||
|
|
||||||
|
def get_running_cuda_version(run_lambda):
|
||||||
|
return run_and_parse_first_match(run_lambda, 'nvcc --version',
|
||||||
|
r'release .+ V(.*)')
|
||||||
|
|
||||||
|
|
||||||
|
def get_cudnn_version(run_lambda):
|
||||||
|
"""Return a list of libcudnn.so; it's hard to tell which one is being used."""
|
||||||
|
if get_platform() == 'win32':
|
||||||
|
system_root = os.environ.get('SYSTEMROOT', 'C:\\Windows')
|
||||||
|
cuda_path = os.environ.get('CUDA_PATH', "%CUDA_PATH%")
|
||||||
|
where_cmd = os.path.join(system_root, 'System32', 'where')
|
||||||
|
cudnn_cmd = '{} /R "{}\\bin" cudnn*.dll'.format(where_cmd, cuda_path)
|
||||||
|
elif get_platform() == 'darwin':
|
||||||
|
# CUDA libraries and drivers can be found in /usr/local/cuda/. See
|
||||||
|
# https://docs.nvidia.com/cuda/cuda-installation-guide-mac-os-x/index.html#install
|
||||||
|
# https://docs.nvidia.com/deeplearning/sdk/cudnn-install/index.html#installmac
|
||||||
|
# Use CUDNN_LIBRARY when cudnn library is installed elsewhere.
|
||||||
|
cudnn_cmd = 'ls /usr/local/cuda/lib/libcudnn*'
|
||||||
|
else:
|
||||||
|
cudnn_cmd = 'ldconfig -p | grep libcudnn | rev | cut -d" " -f1 | rev'
|
||||||
|
rc, out, _ = run_lambda(cudnn_cmd)
|
||||||
|
# find will return 1 if there are permission errors or if not found
|
||||||
|
if len(out) == 0 or (rc != 1 and rc != 0):
|
||||||
|
l = os.environ.get('CUDNN_LIBRARY')
|
||||||
|
if l is not None and os.path.isfile(l):
|
||||||
|
return os.path.realpath(l)
|
||||||
|
return None
|
||||||
|
files_set = set()
|
||||||
|
for fn in out.split('\n'):
|
||||||
|
fn = os.path.realpath(fn) # eliminate symbolic links
|
||||||
|
if os.path.isfile(fn):
|
||||||
|
files_set.add(fn)
|
||||||
|
if not files_set:
|
||||||
|
return None
|
||||||
|
# Alphabetize the result because the order is non-deterministic otherwise
|
||||||
|
files = sorted(files_set)
|
||||||
|
if len(files) == 1:
|
||||||
|
return files[0]
|
||||||
|
result = '\n'.join(files)
|
||||||
|
return 'Probably one of the following:\n{}'.format(result)
|
||||||
|
|
||||||
|
|
||||||
|
def get_nvidia_smi():
|
||||||
|
# Note: nvidia-smi is currently available only on Windows and Linux
|
||||||
|
smi = 'nvidia-smi'
|
||||||
|
if get_platform() == 'win32':
|
||||||
|
system_root = os.environ.get('SYSTEMROOT', 'C:\\Windows')
|
||||||
|
program_files_root = os.environ.get('PROGRAMFILES',
|
||||||
|
'C:\\Program Files')
|
||||||
|
legacy_path = os.path.join(program_files_root, 'NVIDIA Corporation',
|
||||||
|
'NVSMI', smi)
|
||||||
|
new_path = os.path.join(system_root, 'System32', smi)
|
||||||
|
smis = [new_path, legacy_path]
|
||||||
|
for candidate_smi in smis:
|
||||||
|
if os.path.exists(candidate_smi):
|
||||||
|
smi = '"{}"'.format(candidate_smi)
|
||||||
|
break
|
||||||
|
return smi
|
||||||
|
|
||||||
|
|
||||||
|
def get_rocm_version(run_lambda):
|
||||||
|
"""Returns the ROCm version if available, otherwise 'N/A'."""
|
||||||
|
return run_and_parse_first_match(run_lambda, 'hipcc --version',
|
||||||
|
r'HIP version: (\S+)')
|
||||||
|
|
||||||
|
|
||||||
|
def get_neuron_sdk_version(run_lambda):
|
||||||
|
# Adapted from your install script
|
||||||
|
try:
|
||||||
|
result = run_lambda(["neuron-ls"])
|
||||||
|
return result if result[0] == 0 else 'N/A'
|
||||||
|
except Exception:
|
||||||
|
return 'N/A'
|
||||||
|
|
||||||
|
|
||||||
|
def get_vllm_version():
|
||||||
|
try:
|
||||||
|
import vllm
|
||||||
|
return vllm.__version__
|
||||||
|
except ImportError:
|
||||||
|
return 'N/A'
|
||||||
|
|
||||||
|
|
||||||
|
def summarize_vllm_build_flags():
|
||||||
|
# This could be a static method if the flags are constant, or dynamic if you need to check environment variables, etc.
|
||||||
|
return 'CUDA Archs: {}; ROCm: {}; Neuron: {}'.format(
|
||||||
|
os.environ.get('TORCH_CUDA_ARCH_LIST', 'Not Set'),
|
||||||
|
'Enabled' if os.environ.get('ROCM_HOME') else 'Disabled',
|
||||||
|
'Enabled' if os.environ.get('NEURON_CORES') else 'Disabled',
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def get_gpu_topo(run_lambda):
|
||||||
|
if get_platform() == 'linux':
|
||||||
|
return run_and_read_all(run_lambda, 'nvidia-smi topo -m')
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
# example outputs of CPU infos
|
||||||
|
# * linux
|
||||||
|
# Architecture: x86_64
|
||||||
|
# CPU op-mode(s): 32-bit, 64-bit
|
||||||
|
# Address sizes: 46 bits physical, 48 bits virtual
|
||||||
|
# Byte Order: Little Endian
|
||||||
|
# CPU(s): 128
|
||||||
|
# On-line CPU(s) list: 0-127
|
||||||
|
# Vendor ID: GenuineIntel
|
||||||
|
# Model name: Intel(R) Xeon(R) Platinum 8375C CPU @ 2.90GHz
|
||||||
|
# CPU family: 6
|
||||||
|
# Model: 106
|
||||||
|
# Thread(s) per core: 2
|
||||||
|
# Core(s) per socket: 32
|
||||||
|
# Socket(s): 2
|
||||||
|
# Stepping: 6
|
||||||
|
# BogoMIPS: 5799.78
|
||||||
|
# Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr
|
||||||
|
# sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc arch_perfmon rep_good nopl
|
||||||
|
# xtopology nonstop_tsc cpuid aperfmperf tsc_known_freq pni pclmulqdq monitor ssse3 fma cx16
|
||||||
|
# pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand
|
||||||
|
# hypervisor lahf_lm abm 3dnowprefetch invpcid_single ssbd ibrs ibpb stibp ibrs_enhanced
|
||||||
|
# fsgsbase tsc_adjust bmi1 avx2 smep bmi2 erms invpcid avx512f avx512dq rdseed adx smap
|
||||||
|
# avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1
|
||||||
|
# xsaves wbnoinvd ida arat avx512vbmi pku ospke avx512_vbmi2 gfni vaes vpclmulqdq
|
||||||
|
# avx512_vnni avx512_bitalg tme avx512_vpopcntdq rdpid md_clear flush_l1d arch_capabilities
|
||||||
|
# Virtualization features:
|
||||||
|
# Hypervisor vendor: KVM
|
||||||
|
# Virtualization type: full
|
||||||
|
# Caches (sum of all):
|
||||||
|
# L1d: 3 MiB (64 instances)
|
||||||
|
# L1i: 2 MiB (64 instances)
|
||||||
|
# L2: 80 MiB (64 instances)
|
||||||
|
# L3: 108 MiB (2 instances)
|
||||||
|
# NUMA:
|
||||||
|
# NUMA node(s): 2
|
||||||
|
# NUMA node0 CPU(s): 0-31,64-95
|
||||||
|
# NUMA node1 CPU(s): 32-63,96-127
|
||||||
|
# Vulnerabilities:
|
||||||
|
# Itlb multihit: Not affected
|
||||||
|
# L1tf: Not affected
|
||||||
|
# Mds: Not affected
|
||||||
|
# Meltdown: Not affected
|
||||||
|
# Mmio stale data: Vulnerable: Clear CPU buffers attempted, no microcode; SMT Host state unknown
|
||||||
|
# Retbleed: Not affected
|
||||||
|
# Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp
|
||||||
|
# Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization
|
||||||
|
# Spectre v2: Mitigation; Enhanced IBRS, IBPB conditional, RSB filling, PBRSB-eIBRS SW sequence
|
||||||
|
# Srbds: Not affected
|
||||||
|
# Tsx async abort: Not affected
|
||||||
|
# * win32
|
||||||
|
# Architecture=9
|
||||||
|
# CurrentClockSpeed=2900
|
||||||
|
# DeviceID=CPU0
|
||||||
|
# Family=179
|
||||||
|
# L2CacheSize=40960
|
||||||
|
# L2CacheSpeed=
|
||||||
|
# Manufacturer=GenuineIntel
|
||||||
|
# MaxClockSpeed=2900
|
||||||
|
# Name=Intel(R) Xeon(R) Platinum 8375C CPU @ 2.90GHz
|
||||||
|
# ProcessorType=3
|
||||||
|
# Revision=27142
|
||||||
|
#
|
||||||
|
# Architecture=9
|
||||||
|
# CurrentClockSpeed=2900
|
||||||
|
# DeviceID=CPU1
|
||||||
|
# Family=179
|
||||||
|
# L2CacheSize=40960
|
||||||
|
# L2CacheSpeed=
|
||||||
|
# Manufacturer=GenuineIntel
|
||||||
|
# MaxClockSpeed=2900
|
||||||
|
# Name=Intel(R) Xeon(R) Platinum 8375C CPU @ 2.90GHz
|
||||||
|
# ProcessorType=3
|
||||||
|
# Revision=27142
|
||||||
|
|
||||||
|
|
||||||
|
def get_cpu_info(run_lambda):
|
||||||
|
rc, out, err = 0, '', ''
|
||||||
|
if get_platform() == 'linux':
|
||||||
|
rc, out, err = run_lambda('lscpu')
|
||||||
|
elif get_platform() == 'win32':
|
||||||
|
rc, out, err = run_lambda(
|
||||||
|
'wmic cpu get Name,Manufacturer,Family,Architecture,ProcessorType,DeviceID, \
|
||||||
|
CurrentClockSpeed,MaxClockSpeed,L2CacheSize,L2CacheSpeed,Revision /VALUE'
|
||||||
|
)
|
||||||
|
elif get_platform() == 'darwin':
|
||||||
|
rc, out, err = run_lambda("sysctl -n machdep.cpu.brand_string")
|
||||||
|
cpu_info = 'None'
|
||||||
|
if rc == 0:
|
||||||
|
cpu_info = out
|
||||||
|
else:
|
||||||
|
cpu_info = err
|
||||||
|
return cpu_info
|
||||||
|
|
||||||
|
|
||||||
|
def get_platform():
|
||||||
|
if sys.platform.startswith('linux'):
|
||||||
|
return 'linux'
|
||||||
|
elif sys.platform.startswith('win32'):
|
||||||
|
return 'win32'
|
||||||
|
elif sys.platform.startswith('cygwin'):
|
||||||
|
return 'cygwin'
|
||||||
|
elif sys.platform.startswith('darwin'):
|
||||||
|
return 'darwin'
|
||||||
|
else:
|
||||||
|
return sys.platform
|
||||||
|
|
||||||
|
|
||||||
|
def get_mac_version(run_lambda):
|
||||||
|
return run_and_parse_first_match(run_lambda, 'sw_vers -productVersion',
|
||||||
|
r'(.*)')
|
||||||
|
|
||||||
|
|
||||||
|
def get_windows_version(run_lambda):
|
||||||
|
system_root = os.environ.get('SYSTEMROOT', 'C:\\Windows')
|
||||||
|
wmic_cmd = os.path.join(system_root, 'System32', 'Wbem', 'wmic')
|
||||||
|
findstr_cmd = os.path.join(system_root, 'System32', 'findstr')
|
||||||
|
return run_and_read_all(
|
||||||
|
run_lambda,
|
||||||
|
'{} os get Caption | {} /v Caption'.format(wmic_cmd, findstr_cmd))
|
||||||
|
|
||||||
|
|
||||||
|
def get_lsb_version(run_lambda):
|
||||||
|
return run_and_parse_first_match(run_lambda, 'lsb_release -a',
|
||||||
|
r'Description:\t(.*)')
|
||||||
|
|
||||||
|
|
||||||
|
def check_release_file(run_lambda):
|
||||||
|
return run_and_parse_first_match(run_lambda, 'cat /etc/*-release',
|
||||||
|
r'PRETTY_NAME="(.*)"')
|
||||||
|
|
||||||
|
|
||||||
|
def get_os(run_lambda):
|
||||||
|
from platform import machine
|
||||||
|
platform = get_platform()
|
||||||
|
|
||||||
|
if platform == 'win32' or platform == 'cygwin':
|
||||||
|
return get_windows_version(run_lambda)
|
||||||
|
|
||||||
|
if platform == 'darwin':
|
||||||
|
version = get_mac_version(run_lambda)
|
||||||
|
if version is None:
|
||||||
|
return None
|
||||||
|
return 'macOS {} ({})'.format(version, machine())
|
||||||
|
|
||||||
|
if platform == 'linux':
|
||||||
|
# Ubuntu/Debian based
|
||||||
|
desc = get_lsb_version(run_lambda)
|
||||||
|
if desc is not None:
|
||||||
|
return '{} ({})'.format(desc, machine())
|
||||||
|
|
||||||
|
# Try reading /etc/*-release
|
||||||
|
desc = check_release_file(run_lambda)
|
||||||
|
if desc is not None:
|
||||||
|
return '{} ({})'.format(desc, machine())
|
||||||
|
|
||||||
|
return '{} ({})'.format(platform, machine())
|
||||||
|
|
||||||
|
# Unknown platform
|
||||||
|
return platform
|
||||||
|
|
||||||
|
|
||||||
|
def get_python_platform():
|
||||||
|
import platform
|
||||||
|
return platform.platform()
|
||||||
|
|
||||||
|
|
||||||
|
def get_libc_version():
|
||||||
|
import platform
|
||||||
|
if get_platform() != 'linux':
|
||||||
|
return 'N/A'
|
||||||
|
return '-'.join(platform.libc_ver())
|
||||||
|
|
||||||
|
|
||||||
|
def get_pip_packages(run_lambda, patterns=None):
|
||||||
|
"""Return `pip list` output. Note: will also find conda-installed pytorch and numpy packages."""
|
||||||
|
if patterns is None:
|
||||||
|
patterns = DEFAULT_PIP_PATTERNS
|
||||||
|
|
||||||
|
# People generally have `pip` as `pip` or `pip3`
|
||||||
|
# But here it is invoked as `python -mpip`
|
||||||
|
def run_with_pip(pip):
|
||||||
|
out = run_and_read_all(run_lambda, pip + ["list", "--format=freeze"])
|
||||||
|
return "\n".join(line for line in out.splitlines()
|
||||||
|
if any(name in line for name in patterns))
|
||||||
|
|
||||||
|
pip_version = 'pip3' if sys.version[0] == '3' else 'pip'
|
||||||
|
out = run_with_pip([sys.executable, '-mpip'])
|
||||||
|
|
||||||
|
return pip_version, out
|
||||||
|
|
||||||
|
|
||||||
|
def get_cachingallocator_config():
|
||||||
|
ca_config = os.environ.get('PYTORCH_CUDA_ALLOC_CONF', '')
|
||||||
|
return ca_config
|
||||||
|
|
||||||
|
|
||||||
|
def get_cuda_module_loading_config():
|
||||||
|
if TORCH_AVAILABLE and torch.cuda.is_available():
|
||||||
|
torch.cuda.init()
|
||||||
|
config = os.environ.get('CUDA_MODULE_LOADING', '')
|
||||||
|
return config
|
||||||
|
else:
|
||||||
|
return "N/A"
|
||||||
|
|
||||||
|
|
||||||
|
def is_xnnpack_available():
|
||||||
|
if TORCH_AVAILABLE:
|
||||||
|
import torch.backends.xnnpack
|
||||||
|
return str(
|
||||||
|
torch.backends.xnnpack.enabled) # type: ignore[attr-defined]
|
||||||
|
else:
|
||||||
|
return "N/A"
|
||||||
|
|
||||||
|
|
||||||
|
def get_env_info():
|
||||||
|
run_lambda = run
|
||||||
|
pip_version, pip_list_output = get_pip_packages(run_lambda)
|
||||||
|
|
||||||
|
if TORCH_AVAILABLE:
|
||||||
|
version_str = torch.__version__
|
||||||
|
debug_mode_str = str(torch.version.debug)
|
||||||
|
cuda_available_str = str(torch.cuda.is_available())
|
||||||
|
cuda_version_str = torch.version.cuda
|
||||||
|
if not hasattr(torch.version,
|
||||||
|
'hip') or torch.version.hip is None: # cuda version
|
||||||
|
hip_compiled_version = hip_runtime_version = miopen_runtime_version = 'N/A'
|
||||||
|
else: # HIP version
|
||||||
|
|
||||||
|
def get_version_or_na(cfg, prefix):
|
||||||
|
_lst = [s.rsplit(None, 1)[-1] for s in cfg if prefix in s]
|
||||||
|
return _lst[0] if _lst else 'N/A'
|
||||||
|
|
||||||
|
cfg = torch._C._show_config().split('\n')
|
||||||
|
hip_runtime_version = get_version_or_na(cfg, 'HIP Runtime')
|
||||||
|
miopen_runtime_version = get_version_or_na(cfg, 'MIOpen')
|
||||||
|
cuda_version_str = 'N/A'
|
||||||
|
hip_compiled_version = torch.version.hip
|
||||||
|
else:
|
||||||
|
version_str = debug_mode_str = cuda_available_str = cuda_version_str = 'N/A'
|
||||||
|
hip_compiled_version = hip_runtime_version = miopen_runtime_version = 'N/A'
|
||||||
|
|
||||||
|
sys_version = sys.version.replace("\n", " ")
|
||||||
|
|
||||||
|
conda_packages = get_conda_packages(run_lambda)
|
||||||
|
|
||||||
|
rocm_version = get_rocm_version(run_lambda)
|
||||||
|
neuron_sdk_version = get_neuron_sdk_version(run_lambda)
|
||||||
|
vllm_version = get_vllm_version()
|
||||||
|
vllm_build_flags = summarize_vllm_build_flags()
|
||||||
|
gpu_topo = get_gpu_topo(run_lambda)
|
||||||
|
|
||||||
|
return SystemEnv(
|
||||||
|
torch_version=version_str,
|
||||||
|
is_debug_build=debug_mode_str,
|
||||||
|
python_version='{} ({}-bit runtime)'.format(
|
||||||
|
sys_version,
|
||||||
|
sys.maxsize.bit_length() + 1),
|
||||||
|
python_platform=get_python_platform(),
|
||||||
|
is_cuda_available=cuda_available_str,
|
||||||
|
cuda_compiled_version=cuda_version_str,
|
||||||
|
cuda_runtime_version=get_running_cuda_version(run_lambda),
|
||||||
|
cuda_module_loading=get_cuda_module_loading_config(),
|
||||||
|
nvidia_gpu_models=get_gpu_info(run_lambda),
|
||||||
|
nvidia_driver_version=get_nvidia_driver_version(run_lambda),
|
||||||
|
cudnn_version=get_cudnn_version(run_lambda),
|
||||||
|
hip_compiled_version=hip_compiled_version,
|
||||||
|
hip_runtime_version=hip_runtime_version,
|
||||||
|
miopen_runtime_version=miopen_runtime_version,
|
||||||
|
pip_version=pip_version,
|
||||||
|
pip_packages=pip_list_output,
|
||||||
|
conda_packages=conda_packages,
|
||||||
|
os=get_os(run_lambda),
|
||||||
|
libc_version=get_libc_version(),
|
||||||
|
gcc_version=get_gcc_version(run_lambda),
|
||||||
|
clang_version=get_clang_version(run_lambda),
|
||||||
|
cmake_version=get_cmake_version(run_lambda),
|
||||||
|
caching_allocator_config=get_cachingallocator_config(),
|
||||||
|
is_xnnpack_available=is_xnnpack_available(),
|
||||||
|
cpu_info=get_cpu_info(run_lambda),
|
||||||
|
rocm_version=rocm_version,
|
||||||
|
neuron_sdk_version=neuron_sdk_version,
|
||||||
|
vllm_version=vllm_version,
|
||||||
|
vllm_build_flags=vllm_build_flags,
|
||||||
|
gpu_topo=gpu_topo,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
env_info_fmt = """
|
||||||
|
PyTorch version: {torch_version}
|
||||||
|
Is debug build: {is_debug_build}
|
||||||
|
CUDA used to build PyTorch: {cuda_compiled_version}
|
||||||
|
ROCM used to build PyTorch: {hip_compiled_version}
|
||||||
|
|
||||||
|
OS: {os}
|
||||||
|
GCC version: {gcc_version}
|
||||||
|
Clang version: {clang_version}
|
||||||
|
CMake version: {cmake_version}
|
||||||
|
Libc version: {libc_version}
|
||||||
|
|
||||||
|
Python version: {python_version}
|
||||||
|
Python platform: {python_platform}
|
||||||
|
Is CUDA available: {is_cuda_available}
|
||||||
|
CUDA runtime version: {cuda_runtime_version}
|
||||||
|
CUDA_MODULE_LOADING set to: {cuda_module_loading}
|
||||||
|
GPU models and configuration: {nvidia_gpu_models}
|
||||||
|
Nvidia driver version: {nvidia_driver_version}
|
||||||
|
cuDNN version: {cudnn_version}
|
||||||
|
HIP runtime version: {hip_runtime_version}
|
||||||
|
MIOpen runtime version: {miopen_runtime_version}
|
||||||
|
Is XNNPACK available: {is_xnnpack_available}
|
||||||
|
|
||||||
|
CPU:
|
||||||
|
{cpu_info}
|
||||||
|
|
||||||
|
Versions of relevant libraries:
|
||||||
|
{pip_packages}
|
||||||
|
{conda_packages}
|
||||||
|
""".strip()
|
||||||
|
|
||||||
|
# both the above code and the following code use `strip()` to
|
||||||
|
# remove leading/trailing whitespaces, so we need to add a newline
|
||||||
|
# in between to separate the two sections
|
||||||
|
env_info_fmt += "\n"
|
||||||
|
|
||||||
|
env_info_fmt += """
|
||||||
|
ROCM Version: {rocm_version}
|
||||||
|
Neuron SDK Version: {neuron_sdk_version}
|
||||||
|
vLLM Version: {vllm_version}
|
||||||
|
vLLM Build Flags:
|
||||||
|
{vllm_build_flags}
|
||||||
|
GPU Topology:
|
||||||
|
{gpu_topo}
|
||||||
|
""".strip()
|
||||||
|
|
||||||
|
|
||||||
|
def pretty_str(envinfo):
|
||||||
|
|
||||||
|
def replace_nones(dct, replacement='Could not collect'):
|
||||||
|
for key in dct.keys():
|
||||||
|
if dct[key] is not None:
|
||||||
|
continue
|
||||||
|
dct[key] = replacement
|
||||||
|
return dct
|
||||||
|
|
||||||
|
def replace_bools(dct, true='Yes', false='No'):
|
||||||
|
for key in dct.keys():
|
||||||
|
if dct[key] is True:
|
||||||
|
dct[key] = true
|
||||||
|
elif dct[key] is False:
|
||||||
|
dct[key] = false
|
||||||
|
return dct
|
||||||
|
|
||||||
|
def prepend(text, tag='[prepend]'):
|
||||||
|
lines = text.split('\n')
|
||||||
|
updated_lines = [tag + line for line in lines]
|
||||||
|
return '\n'.join(updated_lines)
|
||||||
|
|
||||||
|
def replace_if_empty(text, replacement='No relevant packages'):
|
||||||
|
if text is not None and len(text) == 0:
|
||||||
|
return replacement
|
||||||
|
return text
|
||||||
|
|
||||||
|
def maybe_start_on_next_line(string):
|
||||||
|
# If `string` is multiline, prepend a \n to it.
|
||||||
|
if string is not None and len(string.split('\n')) > 1:
|
||||||
|
return '\n{}\n'.format(string)
|
||||||
|
return string
|
||||||
|
|
||||||
|
mutable_dict = envinfo._asdict()
|
||||||
|
|
||||||
|
# If nvidia_gpu_models is multiline, start on the next line
|
||||||
|
mutable_dict['nvidia_gpu_models'] = \
|
||||||
|
maybe_start_on_next_line(envinfo.nvidia_gpu_models)
|
||||||
|
|
||||||
|
# If the machine doesn't have CUDA, report some fields as 'No CUDA'
|
||||||
|
dynamic_cuda_fields = [
|
||||||
|
'cuda_runtime_version',
|
||||||
|
'nvidia_gpu_models',
|
||||||
|
'nvidia_driver_version',
|
||||||
|
]
|
||||||
|
all_cuda_fields = dynamic_cuda_fields + ['cudnn_version']
|
||||||
|
all_dynamic_cuda_fields_missing = all(mutable_dict[field] is None
|
||||||
|
for field in dynamic_cuda_fields)
|
||||||
|
if TORCH_AVAILABLE and not torch.cuda.is_available(
|
||||||
|
) and all_dynamic_cuda_fields_missing:
|
||||||
|
for field in all_cuda_fields:
|
||||||
|
mutable_dict[field] = 'No CUDA'
|
||||||
|
if envinfo.cuda_compiled_version is None:
|
||||||
|
mutable_dict['cuda_compiled_version'] = 'None'
|
||||||
|
|
||||||
|
# Replace True with Yes, False with No
|
||||||
|
mutable_dict = replace_bools(mutable_dict)
|
||||||
|
|
||||||
|
# Replace all None objects with 'Could not collect'
|
||||||
|
mutable_dict = replace_nones(mutable_dict)
|
||||||
|
|
||||||
|
# If either of these are '', replace with 'No relevant packages'
|
||||||
|
mutable_dict['pip_packages'] = replace_if_empty(
|
||||||
|
mutable_dict['pip_packages'])
|
||||||
|
mutable_dict['conda_packages'] = replace_if_empty(
|
||||||
|
mutable_dict['conda_packages'])
|
||||||
|
|
||||||
|
# Tag conda and pip packages with a prefix
|
||||||
|
# If they were previously None, they'll show up as ie '[conda] Could not collect'
|
||||||
|
if mutable_dict['pip_packages']:
|
||||||
|
mutable_dict['pip_packages'] = prepend(
|
||||||
|
mutable_dict['pip_packages'], '[{}] '.format(envinfo.pip_version))
|
||||||
|
if mutable_dict['conda_packages']:
|
||||||
|
mutable_dict['conda_packages'] = prepend(
|
||||||
|
mutable_dict['conda_packages'], '[conda] ')
|
||||||
|
mutable_dict['cpu_info'] = envinfo.cpu_info
|
||||||
|
return env_info_fmt.format(**mutable_dict)
|
||||||
|
|
||||||
|
|
||||||
|
def get_pretty_env_info():
|
||||||
|
return pretty_str(get_env_info())
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
print("Collecting environment information...")
|
||||||
|
output = get_pretty_env_info()
|
||||||
|
print(output)
|
||||||
|
|
||||||
|
if TORCH_AVAILABLE and hasattr(torch, 'utils') and hasattr(
|
||||||
|
torch.utils, '_crash_handler'):
|
||||||
|
minidump_dir = torch.utils._crash_handler.DEFAULT_MINIDUMP_DIR
|
||||||
|
if sys.platform == "linux" and os.path.exists(minidump_dir):
|
||||||
|
dumps = [
|
||||||
|
os.path.join(minidump_dir, dump)
|
||||||
|
for dump in os.listdir(minidump_dir)
|
||||||
|
]
|
||||||
|
latest = max(dumps, key=os.path.getctime)
|
||||||
|
ctime = os.path.getctime(latest)
|
||||||
|
creation_time = datetime.datetime.fromtimestamp(ctime).strftime(
|
||||||
|
'%Y-%m-%d %H:%M:%S')
|
||||||
|
msg = "\n*** Detected a minidump at {} created on {}, ".format(latest, creation_time) + \
|
||||||
|
"if this is related to your bug please include it when you file a report ***"
|
||||||
|
print(msg, file=sys.stderr)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
main()
|
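For reference, a minimal sketch of driving the script above programmatically, assuming it is importable as `collect_env` (the module name is taken from the filename; the functions are the ones defined in the file):

# Minimal usage sketch for the collect_env module above.
import collect_env

# Full human-readable report, same as `python collect_env.py`.
print(collect_env.get_pretty_env_info())

# Individual probes can also be called directly, e.g. the GCC version.
gcc = collect_env.get_gcc_version(collect_env.run)
print("gcc:", gcc if gcc is not None else "Could not collect")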
@@ -1,28 +0,0 @@ (file deleted)
#include <torch/extension.h>

void silu_and_mul(
  torch::Tensor& out,
  torch::Tensor& input);

void gelu_new(
  torch::Tensor& out,
  torch::Tensor& input);

void gelu_fast(
  torch::Tensor& out,
  torch::Tensor& input);

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def(
    "silu_and_mul",
    &silu_and_mul,
    "Activation function used in SwiGLU.");
  m.def(
    "gelu_new",
    &gelu_new,
    "GELU implementation used in GPT-2.");
  m.def(
    "gelu_fast",
    &gelu_fast,
    "Approximate GELU implementation.");
}
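For orientation, a hedged PyTorch-level sketch of what the three ops bound in the deleted header compute; the `*_ref` helper names are illustrative only, not vLLM APIs, and the CUDA kernels below remain the real implementation:

# Reference semantics of silu_and_mul / gelu_new / gelu_fast, for clarity.
import math
import torch

def silu_and_mul_ref(x: torch.Tensor) -> torch.Tensor:
    """SwiGLU gating: split the last dim in half, SiLU(first half) * second half."""
    d = x.shape[-1] // 2
    return torch.nn.functional.silu(x[..., :d]) * x[..., d:]

def gelu_new_ref(x: torch.Tensor) -> torch.Tensor:
    """GPT-2 style tanh-approximated GELU."""
    return 0.5 * x * (1.0 + torch.tanh(
        math.sqrt(2.0 / math.pi) * (x + 0.044715 * x.pow(3))))

def gelu_fast_ref(x: torch.Tensor) -> torch.Tensor:
    """Faster tanh-based GELU approximation, matching the kernel's constants."""
    return 0.5 * x * (1.0 + torch.tanh(
        x * 0.79788456 * (1.0 + 0.044715 * x * x)))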
@ -1,114 +1,150 @@
|
|||||||
#include <torch/extension.h>
|
|
||||||
#include <ATen/cuda/CUDAContext.h>
|
#include <ATen/cuda/CUDAContext.h>
|
||||||
|
#include <torch/all.h>
|
||||||
|
#include <c10/cuda/CUDAGuard.h>
|
||||||
|
|
||||||
|
#include <cmath>
|
||||||
|
|
||||||
|
#include "cuda_compat.h"
|
||||||
#include "dispatch_utils.h"
|
#include "dispatch_utils.h"
|
||||||
|
|
||||||
namespace vllm {
|
namespace vllm {
|
||||||
|
|
||||||
template<typename T>
|
// Activation and gating kernel template.
|
||||||
__device__ __forceinline__ T silu(const T& x) {
|
template <typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&)>
|
||||||
// x * sigmoid(x)
|
__global__ void act_and_mul_kernel(
|
||||||
return (T) (((float) x) / (1.0f + expf((float) -x)));
|
scalar_t* __restrict__ out, // [..., d]
|
||||||
}
|
const scalar_t* __restrict__ input, // [..., 2, d]
|
||||||
|
const int d) {
|
||||||
template<typename scalar_t>
|
|
||||||
__global__ void silu_and_mul_kernel(
|
|
||||||
scalar_t* __restrict__ out, // [..., d]
|
|
||||||
const scalar_t* __restrict__ input, // [..., 2, d]
|
|
||||||
const int d) {
|
|
||||||
const int64_t token_idx = blockIdx.x;
|
const int64_t token_idx = blockIdx.x;
|
||||||
for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
|
for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
|
||||||
const scalar_t x = __ldg(&input[token_idx * 2 * d + idx]);
|
const scalar_t x = VLLM_LDG(&input[token_idx * 2 * d + idx]);
|
||||||
const scalar_t y = __ldg(&input[token_idx * 2 * d + d + idx]);
|
const scalar_t y = VLLM_LDG(&input[token_idx * 2 * d + d + idx]);
|
||||||
out[token_idx * d + idx] = silu(x) * y;
|
out[token_idx * d + idx] = ACT_FN(x) * y;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
} // namespace vllm
|
template <typename T>
|
||||||
|
__device__ __forceinline__ T silu_kernel(const T& x) {
|
||||||
|
// x * sigmoid(x)
|
||||||
|
return (T)(((float)x) / (1.0f + expf((float)-x)));
|
||||||
|
}
|
||||||
|
|
||||||
void silu_and_mul(
|
template <typename T>
|
||||||
torch::Tensor& out, // [..., d]
|
__device__ __forceinline__ T gelu_kernel(const T& x) {
|
||||||
torch::Tensor& input) // [..., 2 * d]
|
// Equivalent to PyTorch GELU with 'none' approximation.
|
||||||
|
// Refer to:
|
||||||
|
// https://github.com/pytorch/pytorch/blob/8ac9b20d4b090c213799e81acf48a55ea8d437d6/aten/src/ATen/native/cuda/ActivationGeluKernel.cu#L36-L38
|
||||||
|
const float f = (float)x;
|
||||||
|
constexpr float ALPHA = M_SQRT1_2;
|
||||||
|
return (T)(f * 0.5f * (1.0f + ::erf(f * ALPHA)));
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
__device__ __forceinline__ T gelu_tanh_kernel(const T& x) {
|
||||||
|
// Equivalent to PyTorch GELU with 'tanh' approximation.
|
||||||
|
// Refer to:
|
||||||
|
// https://github.com/pytorch/pytorch/blob/8ac9b20d4b090c213799e81acf48a55ea8d437d6/aten/src/ATen/native/cuda/ActivationGeluKernel.cu#L25-L30
|
||||||
|
const float f = (float)x;
|
||||||
|
constexpr float BETA = M_SQRT2 * M_2_SQRTPI * 0.5f;
|
||||||
|
constexpr float KAPPA = 0.044715;
|
||||||
|
float x_cube = f * f * f;
|
||||||
|
float inner = BETA * (f + KAPPA * x_cube);
|
||||||
|
return (T)(0.5f * f * (1.0f + ::tanhf(inner)));
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace vllm
|
||||||
|
|
||||||
|
// Launch activation and gating kernel.
|
||||||
|
#define LAUNCH_ACTIVATION_GATE_KERNEL(KERNEL) \
|
||||||
|
int d = input.size(-1) / 2; \
|
||||||
|
int64_t num_tokens = input.numel() / input.size(-1); \
|
||||||
|
dim3 grid(num_tokens); \
|
||||||
|
dim3 block(std::min(d, 1024)); \
|
||||||
|
const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \
|
||||||
|
const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \
|
||||||
|
VLLM_DISPATCH_FLOATING_TYPES( \
|
||||||
|
input.scalar_type(), "act_and_mul_kernel", [&] { \
|
||||||
|
vllm::act_and_mul_kernel<scalar_t, KERNEL<scalar_t>> \
|
||||||
|
<<<grid, block, 0, stream>>>(out.data_ptr<scalar_t>(), \
|
||||||
|
input.data_ptr<scalar_t>(), d); \
|
||||||
|
});
|
||||||
|
|
||||||
|
void silu_and_mul(torch::Tensor& out, // [..., d]
|
||||||
|
torch::Tensor& input) // [..., 2 * d]
|
||||||
{
|
{
|
||||||
int64_t num_tokens = input.numel() / input.size(-1);
|
LAUNCH_ACTIVATION_GATE_KERNEL(vllm::silu_kernel);
|
||||||
int d = input.size(-1) / 2;
|
}
|
||||||
|
|
||||||
dim3 grid(num_tokens);
|
void gelu_and_mul(torch::Tensor& out, // [..., d]
|
||||||
dim3 block(std::min(d, 1024));
|
torch::Tensor& input) // [..., 2 * d]
|
||||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
{
|
||||||
VLLM_DISPATCH_FLOATING_TYPES(
|
LAUNCH_ACTIVATION_GATE_KERNEL(vllm::gelu_kernel);
|
||||||
input.scalar_type(),
|
}
|
||||||
"silu_and_mul_kernel",
|
|
||||||
[&] {
|
void gelu_tanh_and_mul(torch::Tensor& out, // [..., d]
|
||||||
vllm::silu_and_mul_kernel<scalar_t><<<grid, block, 0, stream>>>(
|
torch::Tensor& input) // [..., 2 * d]
|
||||||
out.data_ptr<scalar_t>(),
|
{
|
||||||
input.data_ptr<scalar_t>(),
|
LAUNCH_ACTIVATION_GATE_KERNEL(vllm::gelu_tanh_kernel);
|
||||||
d);
|
|
||||||
});
|
|
||||||
}
|
}
|
||||||
|
|
||||||
namespace vllm {
|
namespace vllm {
|
||||||
|
|
||||||
// Element-wise activation kernel template.
|
// Element-wise activation kernel template.
|
||||||
template<typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&)>
|
template <typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&)>
|
||||||
__global__ void activation_kernel(
|
__global__ void activation_kernel(
|
||||||
scalar_t* __restrict__ out, // [..., d]
|
scalar_t* __restrict__ out, // [..., d]
|
||||||
const scalar_t* __restrict__ input, // [..., d]
|
const scalar_t* __restrict__ input, // [..., d]
|
||||||
const int d) {
|
const int d) {
|
||||||
const int64_t token_idx = blockIdx.x;
|
const int64_t token_idx = blockIdx.x;
|
||||||
for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
|
for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
|
||||||
const scalar_t x = __ldg(&input[token_idx * d + idx]);
|
const scalar_t x = VLLM_LDG(&input[token_idx * d + idx]);
|
||||||
out[token_idx * d + idx] = ACT_FN(x);
|
out[token_idx * d + idx] = ACT_FN(x);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
} // namespace vllm
|
} // namespace vllm
|
||||||
|
|
||||||
// Launch element-wise activation kernel.
|
// Launch element-wise activation kernel.
|
||||||
#define LAUNCH_ACTIVATION_KERNEL(KERNEL) \
|
#define LAUNCH_ACTIVATION_KERNEL(KERNEL) \
|
||||||
int d = input.size(-1); \
|
int d = input.size(-1); \
|
||||||
int64_t num_tokens = input.numel() / d; \
|
int64_t num_tokens = input.numel() / d; \
|
||||||
dim3 grid(num_tokens); \
|
dim3 grid(num_tokens); \
|
||||||
dim3 block(std::min(d, 1024)); \
|
dim3 block(std::min(d, 1024)); \
|
||||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \
|
const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \
|
||||||
VLLM_DISPATCH_FLOATING_TYPES( \
|
const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \
|
||||||
input.scalar_type(), \
|
VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "activation_kernel", [&] { \
|
||||||
"activation_kernel", \
|
vllm::activation_kernel<scalar_t, KERNEL<scalar_t>> \
|
||||||
[&] { \
|
<<<grid, block, 0, stream>>>(out.data_ptr<scalar_t>(), \
|
||||||
vllm::activation_kernel<scalar_t, KERNEL<scalar_t>><<<grid, block, 0, stream>>>( \
|
input.data_ptr<scalar_t>(), d); \
|
||||||
out.data_ptr<scalar_t>(), \
|
});
|
||||||
input.data_ptr<scalar_t>(), \
|
|
||||||
d); \
|
|
||||||
});
|
|
||||||
|
|
||||||
namespace vllm {
|
namespace vllm {
|
||||||
|
|
||||||
template<typename T>
|
template <typename T>
|
||||||
__device__ __forceinline__ T gelu_new_kernel(const T& x) {
|
__device__ __forceinline__ T gelu_new_kernel(const T& x) {
|
||||||
const float x3 = (float) (x * x * x);
|
const float x3 = (float)(x * x * x);
|
||||||
const T t = (T) tanhf((T) (0.79788456f * (float) (x + (T) (0.044715f * x3))));
|
const T t = (T)tanhf((T)(0.79788456f * (float)(x + (T)(0.044715f * x3))));
|
||||||
return ((T) 0.5) * x * (((T) 1.0) + t);
|
return ((T)0.5) * x * (((T)1.0) + t);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<typename T>
|
template <typename T>
|
||||||
__device__ __forceinline__ T gelu_fast_kernel(const T& x) {
|
__device__ __forceinline__ T gelu_fast_kernel(const T& x) {
|
||||||
const float f = (float) x;
|
const float f = (float)x;
|
||||||
const T t = (T) tanhf(((T) (f * 0.79788456f)) * (((T) 1.0) + (T) (0.044715f * f) * x));
|
const T t =
|
||||||
return ((T) 0.5) * x * (((T) 1.0) + t);
|
(T)tanhf(((T)(f * 0.79788456f)) * (((T)1.0) + (T)(0.044715f * f) * x));
|
||||||
|
return ((T)0.5) * x * (((T)1.0) + t);
|
||||||
}
|
}
|
||||||
|
|
||||||
} // namespace vllm
|
} // namespace vllm
|
||||||
|
|
||||||
void gelu_new(
|
void gelu_new(torch::Tensor& out, // [..., d]
|
||||||
torch::Tensor& out, // [..., d]
|
torch::Tensor& input) // [..., d]
|
||||||
torch::Tensor& input) // [..., d]
|
|
||||||
{
|
{
|
||||||
LAUNCH_ACTIVATION_KERNEL(vllm::gelu_new_kernel);
|
LAUNCH_ACTIVATION_KERNEL(vllm::gelu_new_kernel);
|
||||||
}
|
}
|
||||||
|
|
||||||
void gelu_fast(
|
void gelu_fast(torch::Tensor& out, // [..., d]
|
||||||
torch::Tensor& out, // [..., d]
|
torch::Tensor& input) // [..., d]
|
||||||
torch::Tensor& input) // [..., d]
|
|
||||||
{
|
{
|
||||||
LAUNCH_ACTIVATION_KERNEL(vllm::gelu_fast_kernel);
|
LAUNCH_ACTIVATION_KERNEL(vllm::gelu_fast_kernel);
|
||||||
}
|
}
|
||||||
|
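The refactor above folds the activation-and-gate pattern into one templated kernel (act_and_mul_kernel) and adds two gated GELU entry points. A PyTorch-level sketch of the newly bound variants, for illustration only (`*_ref` names are not vLLM APIs; `approximate="tanh"` needs PyTorch >= 1.12):

import torch

def gelu_and_mul_ref(x: torch.Tensor) -> torch.Tensor:
    # Exact (erf-based) GELU on the first half, gated by the second half.
    d = x.shape[-1] // 2
    return torch.nn.functional.gelu(x[..., :d]) * x[..., d:]

def gelu_tanh_and_mul_ref(x: torch.Tensor) -> torch.Tensor:
    # Tanh-approximated GELU, matching gelu_tanh_kernel in the CUDA code.
    d = x.shape[-1] // 2
    return torch.nn.functional.gelu(x[..., :d], approximate="tanh") * x[..., d:]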
@@ -1,42 +0,0 @@ (file deleted)
#include <torch/extension.h>
#include <c10/util/Optional.h>

void paged_attention_v1(
  torch::Tensor& out,
  torch::Tensor& query,
  torch::Tensor& key_cache,
  torch::Tensor& value_cache,
  torch::Tensor& head_mapping,
  float scale,
  torch::Tensor& block_tables,
  torch::Tensor& context_lens,
  int block_size,
  int max_context_len,
  const c10::optional<torch::Tensor>& alibi_slopes);

void paged_attention_v2(
  torch::Tensor& out,
  torch::Tensor& exp_sums,
  torch::Tensor& max_logits,
  torch::Tensor& tmp_out,
  torch::Tensor& query,
  torch::Tensor& key_cache,
  torch::Tensor& value_cache,
  torch::Tensor& head_mapping,
  float scale,
  torch::Tensor& block_tables,
  torch::Tensor& context_lens,
  int block_size,
  int max_context_len,
  const c10::optional<torch::Tensor>& alibi_slopes);

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def(
    "paged_attention_v1",
    &paged_attention_v1,
    "Compute the attention between an input query and the cached keys/values using PagedAttention.");
  m.def(
    "paged_attention_v2",
    &paged_attention_v2,
    "PagedAttention V2.");
}
@@ -4,3 +4,4 @@
 #include "dtype_float16.cuh"
 #include "dtype_float32.cuh"
 #include "dtype_bfloat16.cuh"
+#include "dtype_fp8.cuh"
@@ -1,5 +1,6 @@
 /*
- * Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h
+ * Adapted from
+ * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h
 * Copyright (c) 2023, The vLLM team.
 * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
 *
@ -22,31 +23,31 @@
|
|||||||
namespace vllm {
|
namespace vllm {
|
||||||
|
|
||||||
// A vector type to store Q, K, V elements.
|
// A vector type to store Q, K, V elements.
|
||||||
template<typename T, int VEC_SIZE>
|
template <typename T, int VEC_SIZE>
|
||||||
struct Vec {};
|
struct Vec {};
|
||||||
|
|
||||||
// A vector type to store FP32 accumulators.
|
// A vector type to store FP32 accumulators.
|
||||||
template<typename T>
|
template <typename T>
|
||||||
struct FloatVec {};
|
struct FloatVec {};
|
||||||
|
|
||||||
// Template vector operations.
|
// Template vector operations.
|
||||||
template<typename Acc, typename A, typename B>
|
template <typename Acc, typename A, typename B>
|
||||||
inline __device__ Acc mul(A a, B b);
|
inline __device__ Acc mul(A a, B b);
|
||||||
|
|
||||||
template<typename T>
|
template <typename T>
|
||||||
inline __device__ float sum(T v);
|
inline __device__ float sum(T v);
|
||||||
|
|
||||||
template<typename T>
|
template <typename T>
|
||||||
inline __device__ float dot(T a, T b) {
|
inline __device__ float dot(T a, T b) {
|
||||||
return sum(mul<T, T, T>(a, b));
|
return sum(mul<T, T, T>(a, b));
|
||||||
}
|
}
|
||||||
|
|
||||||
template<typename A, typename T>
|
template <typename A, typename T>
|
||||||
inline __device__ float dot(T a, T b) {
|
inline __device__ float dot(T a, T b) {
|
||||||
return sum(mul<A, T, T>(a, b));
|
return sum(mul<A, T, T>(a, b));
|
||||||
}
|
}
|
||||||
|
|
||||||
template<typename T>
|
template <typename T>
|
||||||
inline __device__ void zero(T& dst) {
|
inline __device__ void zero(T& dst) {
|
||||||
constexpr int WORDS = sizeof(T) / 4;
|
constexpr int WORDS = sizeof(T) / 4;
|
||||||
union {
|
union {
|
||||||
@ -61,4 +62,4 @@ inline __device__ void zero(T& dst) {
|
|||||||
dst = tmp.raw;
|
dst = tmp.raw;
|
||||||
}
|
}
|
||||||
|
|
||||||
} // namespace vllm
|
} // namespace vllm
|
||||||
|
File diff suppressed because it is too large.
@@ -1,5 +1,6 @@
 /*
- * Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp
+ * Adapted from
+ * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp
 * Copyright (c) 2023, The vLLM team.
 * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
 *
@@ -17,6 +18,7 @@
 */
 #pragma once

+#include "../cuda_compat.h"
 #include "attention_dtypes.h"

 #include <float.h>
@@ -25,7 +27,7 @@
 namespace vllm {

 // Q*K^T operation.
-template<int THREAD_GROUP_SIZE, typename Vec, int N>
+template <int THREAD_GROUP_SIZE, typename Vec, int N>
 inline __device__ float qk_dot_(const Vec (&q)[N], const Vec (&k)[N]) {
   using A_vec = typename FloatVec<Vec>::Type;
   // Compute the parallel products for Q*K^T (treat vector lanes separately).
@@ -39,17 +41,17 @@ inline __device__ float qk_dot_(const Vec (&q)[N], const Vec (&k)[N]) {
   float qk = sum(qk_vec);
 #pragma unroll
   for (int mask = THREAD_GROUP_SIZE / 2; mask >= 1; mask /= 2) {
-    qk += __shfl_xor_sync(uint32_t(-1), qk, mask);
+    qk += VLLM_SHFL_XOR_SYNC(qk, mask);
   }
   return qk;
 }

-template<typename T, int THREAD_GROUP_SIZE>
+template <typename T, int THREAD_GROUP_SIZE>
 struct Qk_dot {
-  template<typename Vec, int N>
+  template <typename Vec, int N>
   static inline __device__ float dot(const Vec (&q)[N], const Vec (&k)[N]) {
     return qk_dot_<THREAD_GROUP_SIZE>(q, k);
   }
 };

 } // namespace vllm
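The VLLM_SHFL_XOR_SYNC change above only swaps the warp-shuffle intrinsic for a portable wrapper; the reduction itself is an XOR-butterfly sum across the thread group. A NumPy sketch of that reduction pattern, for illustration only (`butterfly_sum` is not a vLLM function):

# After log2(group_size) rounds, every lane in the group holds the same sum,
# mirroring the mask loop in qk_dot_.
import numpy as np

def butterfly_sum(vals: np.ndarray, group_size: int) -> np.ndarray:
    vals = vals.copy()
    mask = group_size // 2
    while mask >= 1:
        # Each lane i adds the value held by lane (i XOR mask), all at once.
        partner = np.arange(vals.size) ^ mask
        vals = vals + vals[partner]
        mask //= 2
    return vals

qk_partial = np.array([0.5, 1.0, 1.5, 2.0], dtype=np.float32)
print(butterfly_sum(qk_partial, group_size=4))  # every lane ends up with 5.0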
@@ -1,6 +1,8 @@
 /*
- * Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp
- * and https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h
+ * Adapted from
+ * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp
+ * and
+ * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h
 * Copyright (c) 2023, The vLLM team.
 * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
 *
@ -21,8 +23,17 @@
|
|||||||
#include "attention_generic.cuh"
|
#include "attention_generic.cuh"
|
||||||
#include "dtype_float32.cuh"
|
#include "dtype_float32.cuh"
|
||||||
|
|
||||||
#include <cuda_bf16.h>
|
#ifndef USE_ROCM
|
||||||
#include <cuda_fp16.h>
|
#include <cuda_bf16.h>
|
||||||
|
#include <cuda_fp16.h>
|
||||||
|
#else
|
||||||
|
#include <hip/hip_bf16.h>
|
||||||
|
#include <hip/hip_fp16.h>
|
||||||
|
|
||||||
|
typedef __hip_bfloat162 __nv_bfloat162;
|
||||||
|
typedef __hip_bfloat16 __nv_bfloat16;
|
||||||
|
#endif
|
||||||
|
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
|
|
||||||
namespace vllm {
|
namespace vllm {
|
||||||
@ -41,37 +52,37 @@ struct bf16_8_t {
|
|||||||
};
|
};
|
||||||
|
|
||||||
// BF16 vector types for Q, K, V.
|
// BF16 vector types for Q, K, V.
|
||||||
template<>
|
template <>
|
||||||
struct Vec<__nv_bfloat16, 1> {
|
struct Vec<__nv_bfloat16, 1> {
|
||||||
using Type = __nv_bfloat16;
|
using Type = __nv_bfloat16;
|
||||||
};
|
};
|
||||||
template<>
|
template <>
|
||||||
struct Vec<__nv_bfloat16, 2> {
|
struct Vec<__nv_bfloat16, 2> {
|
||||||
using Type = __nv_bfloat162;
|
using Type = __nv_bfloat162;
|
||||||
};
|
};
|
||||||
template<>
|
template <>
|
||||||
struct Vec<__nv_bfloat16, 4> {
|
struct Vec<__nv_bfloat16, 4> {
|
||||||
using Type = bf16_4_t;
|
using Type = bf16_4_t;
|
||||||
};
|
};
|
||||||
template<>
|
template <>
|
||||||
struct Vec<__nv_bfloat16, 8> {
|
struct Vec<__nv_bfloat16, 8> {
|
||||||
using Type = bf16_8_t;
|
using Type = bf16_8_t;
|
||||||
};
|
};
|
||||||
|
|
||||||
// FP32 accumulator vector types corresponding to Vec.
|
// FP32 accumulator vector types corresponding to Vec.
|
||||||
template<>
|
template <>
|
||||||
struct FloatVec<__nv_bfloat16> {
|
struct FloatVec<__nv_bfloat16> {
|
||||||
using Type = float;
|
using Type = float;
|
||||||
};
|
};
|
||||||
template<>
|
template <>
|
||||||
struct FloatVec<__nv_bfloat162> {
|
struct FloatVec<__nv_bfloat162> {
|
||||||
using Type = float2;
|
using Type = float2;
|
||||||
};
|
};
|
||||||
template<>
|
template <>
|
||||||
struct FloatVec<bf16_4_t> {
|
struct FloatVec<bf16_4_t> {
|
||||||
using Type = Float4_;
|
using Type = Float4_;
|
||||||
};
|
};
|
||||||
template<>
|
template <>
|
||||||
struct FloatVec<bf16_8_t> {
|
struct FloatVec<bf16_8_t> {
|
||||||
using Type = Float8_;
|
using Type = Float8_;
|
||||||
};
|
};
|
||||||
@ -98,7 +109,11 @@ inline __device__ __nv_bfloat16 add(__nv_bfloat16 a, __nv_bfloat16 b) {
|
|||||||
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
|
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
|
||||||
assert(false);
|
assert(false);
|
||||||
#else
|
#else
|
||||||
|
#ifndef USE_ROCM
|
||||||
return a + b;
|
return a + b;
|
||||||
|
#else
|
||||||
|
return __hadd(a, b);
|
||||||
|
#endif
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -148,7 +163,7 @@ inline __device__ Float8_ add(bf16_8_t a, Float8_ fb) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Vector multiplication.
|
// Vector multiplication.
|
||||||
template<>
|
template <>
|
||||||
inline __device__ __nv_bfloat16 mul(__nv_bfloat16 a, __nv_bfloat16 b) {
|
inline __device__ __nv_bfloat16 mul(__nv_bfloat16 a, __nv_bfloat16 b) {
|
||||||
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
|
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
|
||||||
assert(false);
|
assert(false);
|
||||||
@ -157,7 +172,7 @@ inline __device__ __nv_bfloat16 mul(__nv_bfloat16 a, __nv_bfloat16 b) {
|
|||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
template<>
|
template <>
|
||||||
inline __device__ __nv_bfloat162 mul(__nv_bfloat162 a, __nv_bfloat162 b) {
|
inline __device__ __nv_bfloat162 mul(__nv_bfloat162 a, __nv_bfloat162 b) {
|
||||||
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
|
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
|
||||||
assert(false);
|
assert(false);
|
||||||
@ -166,12 +181,12 @@ inline __device__ __nv_bfloat162 mul(__nv_bfloat162 a, __nv_bfloat162 b) {
|
|||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
template<>
|
template <>
|
||||||
inline __device__ __nv_bfloat162 mul(__nv_bfloat16 a, __nv_bfloat162 b) {
|
inline __device__ __nv_bfloat162 mul(__nv_bfloat16 a, __nv_bfloat162 b) {
|
||||||
return mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(bf162bf162(a), b);
|
return mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(bf162bf162(a), b);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<>
|
template <>
|
||||||
inline __device__ bf16_4_t mul(bf16_4_t a, bf16_4_t b) {
|
inline __device__ bf16_4_t mul(bf16_4_t a, bf16_4_t b) {
|
||||||
bf16_4_t c;
|
bf16_4_t c;
|
||||||
c.x = mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(a.x, b.x);
|
c.x = mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(a.x, b.x);
|
||||||
@ -179,7 +194,7 @@ inline __device__ bf16_4_t mul(bf16_4_t a, bf16_4_t b) {
|
|||||||
return c;
|
return c;
|
||||||
}
|
}
|
||||||
|
|
||||||
template<>
|
template <>
|
||||||
inline __device__ bf16_4_t mul(__nv_bfloat16 a, bf16_4_t b) {
|
inline __device__ bf16_4_t mul(__nv_bfloat16 a, bf16_4_t b) {
|
||||||
__nv_bfloat162 s = bf162bf162(a);
|
__nv_bfloat162 s = bf162bf162(a);
|
||||||
bf16_4_t c;
|
bf16_4_t c;
|
||||||
@ -188,7 +203,7 @@ inline __device__ bf16_4_t mul(__nv_bfloat16 a, bf16_4_t b) {
|
|||||||
return c;
|
return c;
|
||||||
}
|
}
|
||||||
|
|
||||||
template<>
|
template <>
|
||||||
inline __device__ bf16_8_t mul(bf16_8_t a, bf16_8_t b) {
|
inline __device__ bf16_8_t mul(bf16_8_t a, bf16_8_t b) {
|
||||||
bf16_8_t c;
|
bf16_8_t c;
|
||||||
c.x = mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(a.x, b.x);
|
c.x = mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(a.x, b.x);
|
||||||
@ -198,7 +213,7 @@ inline __device__ bf16_8_t mul(bf16_8_t a, bf16_8_t b) {
|
|||||||
return c;
|
return c;
|
||||||
}
|
}
|
||||||
|
|
||||||
template<>
|
template <>
|
||||||
inline __device__ bf16_8_t mul(__nv_bfloat16 a, bf16_8_t b) {
|
inline __device__ bf16_8_t mul(__nv_bfloat16 a, bf16_8_t b) {
|
||||||
__nv_bfloat162 s = bf162bf162(a);
|
__nv_bfloat162 s = bf162bf162(a);
|
||||||
bf16_8_t c;
|
bf16_8_t c;
|
||||||
@ -209,26 +224,26 @@ inline __device__ bf16_8_t mul(__nv_bfloat16 a, bf16_8_t b) {
|
|||||||
return c;
|
return c;
|
||||||
}
|
}
|
||||||
|
|
||||||
template<>
|
template <>
|
||||||
inline __device__ float mul(__nv_bfloat16 a, __nv_bfloat16 b) {
|
inline __device__ float mul(__nv_bfloat16 a, __nv_bfloat16 b) {
|
||||||
float fa = __bfloat162float(a);
|
float fa = __bfloat162float(a);
|
||||||
float fb = __bfloat162float(b);
|
float fb = __bfloat162float(b);
|
||||||
return fa * fb;
|
return fa * fb;
|
||||||
}
|
}
|
||||||
|
|
||||||
template<>
|
template <>
|
||||||
inline __device__ float2 mul(__nv_bfloat162 a, __nv_bfloat162 b) {
|
inline __device__ float2 mul(__nv_bfloat162 a, __nv_bfloat162 b) {
|
||||||
float2 fa = bf1622float2(a);
|
float2 fa = bf1622float2(a);
|
||||||
float2 fb = bf1622float2(b);
|
float2 fb = bf1622float2(b);
|
||||||
return mul<float2, float2, float2>(fa, fb);
|
return mul<float2, float2, float2>(fa, fb);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<>
|
template <>
|
||||||
inline __device__ float2 mul(__nv_bfloat16 a, __nv_bfloat162 b) {
|
inline __device__ float2 mul(__nv_bfloat16 a, __nv_bfloat162 b) {
|
||||||
return mul<float2, __nv_bfloat162, __nv_bfloat162>(bf162bf162(a), b);
|
return mul<float2, __nv_bfloat162, __nv_bfloat162>(bf162bf162(a), b);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<>
|
template <>
|
||||||
inline __device__ Float4_ mul(bf16_4_t a, bf16_4_t b) {
|
inline __device__ Float4_ mul(bf16_4_t a, bf16_4_t b) {
|
||||||
Float4_ fc;
|
Float4_ fc;
|
||||||
fc.x = mul<float2, __nv_bfloat162, __nv_bfloat162>(a.x, b.x);
|
fc.x = mul<float2, __nv_bfloat162, __nv_bfloat162>(a.x, b.x);
|
||||||
@ -236,7 +251,7 @@ inline __device__ Float4_ mul(bf16_4_t a, bf16_4_t b) {
|
|||||||
return fc;
|
return fc;
|
||||||
}
|
}
|
||||||
|
|
||||||
template<>
|
template <>
|
||||||
inline __device__ Float4_ mul(__nv_bfloat16 a, bf16_4_t b) {
|
inline __device__ Float4_ mul(__nv_bfloat16 a, bf16_4_t b) {
|
||||||
__nv_bfloat162 s = bf162bf162(a);
|
__nv_bfloat162 s = bf162bf162(a);
|
||||||
Float4_ fc;
|
Float4_ fc;
|
||||||
@ -245,7 +260,7 @@ inline __device__ Float4_ mul(__nv_bfloat16 a, bf16_4_t b) {
|
|||||||
return fc;
|
return fc;
|
||||||
}
|
}
|
||||||
|
|
||||||
template<>
|
template <>
|
||||||
inline __device__ Float8_ mul(bf16_8_t a, bf16_8_t b) {
|
inline __device__ Float8_ mul(bf16_8_t a, bf16_8_t b) {
|
||||||
Float8_ fc;
|
Float8_ fc;
|
||||||
fc.x = mul<float2, __nv_bfloat162, __nv_bfloat162>(a.x, b.x);
|
fc.x = mul<float2, __nv_bfloat162, __nv_bfloat162>(a.x, b.x);
|
||||||
@ -255,7 +270,7 @@ inline __device__ Float8_ mul(bf16_8_t a, bf16_8_t b) {
|
|||||||
return fc;
|
return fc;
|
||||||
}
|
}
|
||||||
|
|
||||||
template<>
|
template <>
|
||||||
inline __device__ Float8_ mul(__nv_bfloat16 a, bf16_8_t b) {
|
inline __device__ Float8_ mul(__nv_bfloat16 a, bf16_8_t b) {
|
||||||
__nv_bfloat162 s = bf162bf162(a);
|
__nv_bfloat162 s = bf162bf162(a);
|
||||||
Float8_ fc;
|
Float8_ fc;
|
||||||
@ -267,7 +282,8 @@ inline __device__ Float8_ mul(__nv_bfloat16 a, bf16_8_t b) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Vector fused multiply-add.
|
// Vector fused multiply-add.
|
||||||
inline __device__ __nv_bfloat162 fma(__nv_bfloat162 a, __nv_bfloat162 b, __nv_bfloat162 c) {
|
inline __device__ __nv_bfloat162 fma(__nv_bfloat162 a, __nv_bfloat162 b,
|
||||||
|
__nv_bfloat162 c) {
|
||||||
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
|
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
|
||||||
assert(false);
|
assert(false);
|
||||||
#else
|
#else
|
||||||
@ -275,7 +291,8 @@ inline __device__ __nv_bfloat162 fma(__nv_bfloat162 a, __nv_bfloat162 b, __nv_bf
|
|||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
inline __device__ __nv_bfloat162 fma(__nv_bfloat16 a, __nv_bfloat162 b, __nv_bfloat162 c) {
|
inline __device__ __nv_bfloat162 fma(__nv_bfloat16 a, __nv_bfloat162 b,
|
||||||
|
__nv_bfloat162 c) {
|
||||||
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
|
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
|
||||||
assert(false);
|
assert(false);
|
||||||
#else
|
#else
|
||||||
@ -366,23 +383,23 @@ inline __device__ Float8_ fma(__nv_bfloat16 a, bf16_8_t b, Float8_ fc) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Vector sum.
|
// Vector sum.
|
||||||
template<>
|
template <>
|
||||||
inline __device__ float sum(__nv_bfloat16 v) {
|
inline __device__ float sum(__nv_bfloat16 v) {
|
||||||
return __bfloat162float(v);
|
return __bfloat162float(v);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<>
|
template <>
|
||||||
inline __device__ float sum(__nv_bfloat162 v) {
|
inline __device__ float sum(__nv_bfloat162 v) {
|
||||||
float2 vf = bf1622float2(v);
|
float2 vf = bf1622float2(v);
|
||||||
return vf.x + vf.y;
|
return vf.x + vf.y;
|
||||||
}
|
}
|
||||||
|
|
||||||
template<>
|
template <>
|
||||||
inline __device__ float sum(bf16_4_t v) {
|
inline __device__ float sum(bf16_4_t v) {
|
||||||
return sum(v.x) + sum(v.y);
|
return sum(v.x) + sum(v.y);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<>
|
template <>
|
||||||
inline __device__ float sum(bf16_8_t v) {
|
inline __device__ float sum(bf16_8_t v) {
|
||||||
return sum(v.x) + sum(v.y) + sum(v.z) + sum(v.w);
|
return sum(v.x) + sum(v.y) + sum(v.z) + sum(v.w);
|
||||||
}
|
}
|
||||||
@ -435,4 +452,4 @@ inline __device__ void zero(__nv_bfloat16& dst) {
|
|||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
} // namespace vllm
|
} // namespace vllm
|
||||||
|
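Both 16-bit dtype headers are mostly bit-pattern plumbing between the storage types and FP32 accumulators. A host-side NumPy sketch of the underlying conversions, for illustration only (the helper names are not vLLM code):

# bf16 is the top 16 bits of an FP32 bit pattern; fp16 is IEEE half precision.
import numpy as np

def bf16_bits_to_float(bits: int) -> float:
    # Widen the bf16 bit pattern into the high half of a float32 word.
    return float(np.array([bits << 16], dtype=np.uint32).view(np.float32)[0])

def fp16_bits_to_float(bits: int) -> float:
    return float(np.array([bits], dtype=np.uint16).view(np.float16)[0])

print(bf16_bits_to_float(0x3F80))  # 1.0
print(fp16_bits_to_float(0x3C00))  # 1.0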
|||||||
@@ -1,6 +1,8 @@
 /*
- * Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp
- * and https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h
+ * Adapted from
+ * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp
+ * and
+ * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h
 * Copyright (c) 2023, The vLLM team.
 * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
 *
@ -21,63 +23,93 @@
|
|||||||
#include "attention_generic.cuh"
|
#include "attention_generic.cuh"
|
||||||
#include "dtype_float32.cuh"
|
#include "dtype_float32.cuh"
|
||||||
|
|
||||||
|
#ifdef USE_ROCM
|
||||||
|
#include <hip/hip_fp16.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
|
|
||||||
namespace vllm {
|
namespace vllm {
|
||||||
|
|
||||||
// FP16 vector types for Q, K, V.
|
// FP16 vector types for Q, K, V.
|
||||||
template<>
|
template <>
|
||||||
struct Vec<uint16_t, 1> {
|
struct Vec<uint16_t, 1> {
|
||||||
using Type = uint16_t;
|
using Type = uint16_t;
|
||||||
};
|
};
|
||||||
template<>
|
template <>
|
||||||
struct Vec<uint16_t, 2> {
|
struct Vec<uint16_t, 2> {
|
||||||
using Type = uint32_t;
|
using Type = uint32_t;
|
||||||
};
|
};
|
||||||
template<>
|
template <>
|
||||||
struct Vec<uint16_t, 4> {
|
struct Vec<uint16_t, 4> {
|
||||||
using Type = uint2;
|
using Type = uint2;
|
||||||
};
|
};
|
||||||
template<>
|
template <>
|
||||||
struct Vec<uint16_t, 8> {
|
struct Vec<uint16_t, 8> {
|
||||||
using Type = uint4;
|
using Type = uint4;
|
||||||
};
|
};
|
||||||
|
|
||||||
// FP32 accumulator vector types corresponding to Vec.
|
// FP32 accumulator vector types corresponding to Vec.
|
||||||
template<>
|
template <>
|
||||||
struct FloatVec<uint16_t> {
|
struct FloatVec<uint16_t> {
|
||||||
using Type = float;
|
using Type = float;
|
||||||
};
|
};
|
||||||
template<>
|
template <>
|
||||||
struct FloatVec<uint32_t> {
|
struct FloatVec<uint32_t> {
|
||||||
using Type = float2;
|
using Type = float2;
|
||||||
};
|
};
|
||||||
template<>
|
template <>
|
||||||
struct FloatVec<uint2> {
|
struct FloatVec<uint2> {
|
||||||
using Type = Float4_;
|
using Type = Float4_;
|
||||||
};
|
};
|
||||||
template<>
|
template <>
|
||||||
struct FloatVec<uint4> {
|
struct FloatVec<uint4> {
|
||||||
using Type = Float8_;
|
using Type = Float8_;
|
||||||
};
|
};
|
||||||
|
|
||||||
// Utility functions for type conversions.
|
// Utility functions for type conversions.
|
||||||
inline __device__ uint32_t h0_h0(uint16_t a) {
|
inline __device__ uint32_t h0_h0(uint16_t a) {
|
||||||
|
#ifndef USE_ROCM
|
||||||
uint32_t b;
|
uint32_t b;
|
||||||
asm volatile("mov.b32 %0, {%1, %1};" : "=r"(b) : "h"(a));
|
asm volatile("mov.b32 %0, {%1, %1};" : "=r"(b) : "h"(a));
|
||||||
return b;
|
return b;
|
||||||
|
#else
|
||||||
|
union {
|
||||||
|
uint32_t u32;
|
||||||
|
uint16_t u16[2];
|
||||||
|
} tmp;
|
||||||
|
tmp.u16[0] = a;
|
||||||
|
tmp.u16[1] = a;
|
||||||
|
return tmp.u32;
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
inline __device__ float half_to_float(uint16_t h) {
|
inline __device__ float half_to_float(uint16_t h) {
|
||||||
float f;
|
float f;
|
||||||
|
#ifndef USE_ROCM
|
||||||
asm volatile("cvt.f32.f16 %0, %1;\n" : "=f"(f) : "h"(h));
|
asm volatile("cvt.f32.f16 %0, %1;\n" : "=f"(f) : "h"(h));
|
||||||
|
#else
|
||||||
|
asm volatile("v_cvt_f32_f16 %0, %1;" : "=v"(f) : "v"(h));
|
||||||
|
#endif
|
||||||
return f;
|
return f;
|
||||||
}
|
}
|
||||||
|
|
||||||
inline __device__ float2 half2_to_float2(uint32_t v) {
|
inline __device__ float2 half2_to_float2(uint32_t v) {
|
||||||
|
#ifndef USE_ROCM
|
||||||
uint16_t lo, hi;
|
uint16_t lo, hi;
|
||||||
asm volatile("mov.b32 {%0, %1}, %2;\n" : "=h"(lo), "=h"(hi) : "r"(v));
|
asm volatile("mov.b32 {%0, %1}, %2;\n" : "=h"(lo), "=h"(hi) : "r"(v));
|
||||||
return make_float2(half_to_float(lo), half_to_float(hi));
|
return make_float2(half_to_float(lo), half_to_float(hi));
|
||||||
|
#else
|
||||||
|
union {
|
||||||
|
uint32_t u32;
|
||||||
|
uint16_t u16[2];
|
||||||
|
} tmp;
|
||||||
|
tmp.u32 = v;
|
||||||
|
float2 ret;
|
||||||
|
ret.x = half_to_float(tmp.u16[0]);
|
||||||
|
ret.y = half_to_float(tmp.u16[1]);
|
||||||
|
return ret;
|
||||||
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
inline __device__ uint16_t float_to_half(float f) {
|
inline __device__ uint16_t float_to_half(float f) {
|
||||||
@ -85,7 +117,11 @@ inline __device__ uint16_t float_to_half(float f) {
|
|||||||
uint32_t u32;
|
uint32_t u32;
|
||||||
uint16_t u16[2];
|
uint16_t u16[2];
|
||||||
} tmp;
|
} tmp;
|
||||||
|
#ifndef USE_ROCM
|
||||||
asm volatile("cvt.rn.f16.f32 %0, %1;\n" : "=h"(tmp.u16[0]) : "f"(f));
|
asm volatile("cvt.rn.f16.f32 %0, %1;\n" : "=h"(tmp.u16[0]) : "f"(f));
|
||||||
|
#else
|
||||||
|
asm volatile("v_cvt_f16_f32 %0, %1;\n" : "=v"(tmp.u32) : "v"(f));
|
||||||
|
#endif
|
||||||
return tmp.u16[0];
|
return tmp.u16[0];
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -94,12 +130,18 @@ inline __device__ uint32_t float2_to_half2(float2 f) {
|
|||||||
uint32_t u32;
|
uint32_t u32;
|
||||||
uint16_t u16[2];
|
uint16_t u16[2];
|
||||||
} tmp;
|
} tmp;
|
||||||
|
#ifndef USE_ROCM
|
||||||
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
|
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
|
||||||
asm volatile("cvt.rn.f16x2.f32 %0, %1, %2;\n" : "=r"(tmp.u32) : "f"(f.y), "f"(f.x));
|
asm volatile("cvt.rn.f16x2.f32 %0, %1, %2;\n"
|
||||||
#else
|
: "=r"(tmp.u32)
|
||||||
|
: "f"(f.y), "f"(f.x));
|
||||||
|
#else
|
||||||
asm volatile("cvt.rn.f16.f32 %0, %1;\n" : "=h"(tmp.u16[0]) : "f"(f.x));
|
asm volatile("cvt.rn.f16.f32 %0, %1;\n" : "=h"(tmp.u16[0]) : "f"(f.x));
|
||||||
asm volatile("cvt.rn.f16.f32 %0, %1;\n" : "=h"(tmp.u16[1]) : "f"(f.y));
|
asm volatile("cvt.rn.f16.f32 %0, %1;\n" : "=h"(tmp.u16[1]) : "f"(f.y));
|
||||||
|
#endif
|
||||||
|
#else
|
||||||
|
tmp.u16[0] = float_to_half(f.x);
|
||||||
|
tmp.u16[1] = float_to_half(f.y);
|
||||||
#endif
|
#endif
|
||||||
return tmp.u32;
|
return tmp.u32;
|
||||||
}
|
}
|
||||||
@ -107,13 +149,21 @@ inline __device__ uint32_t float2_to_half2(float2 f) {
|
|||||||
// Vector addition.
|
// Vector addition.
|
||||||
inline __device__ uint16_t add(uint16_t a, uint16_t b) {
|
inline __device__ uint16_t add(uint16_t a, uint16_t b) {
|
||||||
uint16_t c;
|
uint16_t c;
|
||||||
|
#ifndef USE_ROCM
|
||||||
asm volatile("add.f16 %0, %1, %2;\n" : "=h"(c) : "h"(a), "h"(b));
|
asm volatile("add.f16 %0, %1, %2;\n" : "=h"(c) : "h"(a), "h"(b));
|
||||||
|
#else
|
||||||
|
asm volatile("v_add_f16 %0, %1, %2;\n" : "=v"(c) : "v"(a), "v"(b));
|
||||||
|
#endif
|
||||||
return c;
|
return c;
|
||||||
}
|
}
|
||||||
|
|
||||||
inline __device__ uint32_t add(uint32_t a, uint32_t b) {
|
inline __device__ uint32_t add(uint32_t a, uint32_t b) {
|
||||||
uint32_t c;
|
uint32_t c;
|
||||||
|
#ifndef USE_ROCM
|
||||||
asm volatile("add.f16x2 %0, %1, %2;\n" : "=r"(c) : "r"(a), "r"(b));
|
asm volatile("add.f16x2 %0, %1, %2;\n" : "=r"(c) : "r"(a), "r"(b));
|
||||||
|
#else
|
||||||
|
asm volatile("v_pk_add_f16 %0, %1, %2;\n" : "=v"(c) : "v"(a), "v"(b));
|
||||||
|
#endif
|
||||||
return c;
|
return c;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -155,26 +205,34 @@ inline __device__ Float8_ add(uint4 a, Float8_ fb) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Vector multiplication.
|
// Vector multiplication.
|
||||||
template<>
|
template <>
|
||||||
inline __device__ uint16_t mul(uint16_t a, uint16_t b) {
|
inline __device__ uint16_t mul(uint16_t a, uint16_t b) {
|
||||||
uint16_t c;
|
uint16_t c;
|
||||||
|
#ifndef USE_ROCM
|
||||||
asm volatile("mul.f16 %0, %1, %2;\n" : "=h"(c) : "h"(a), "h"(b));
|
asm volatile("mul.f16 %0, %1, %2;\n" : "=h"(c) : "h"(a), "h"(b));
|
||||||
|
#else
|
||||||
|
asm volatile("v_mul_f16 %0, %1, %2;\n" : "=v"(c) : "v"(a), "v"(b));
|
||||||
|
#endif
|
||||||
return c;
|
return c;
|
||||||
}
|
}
|
||||||
|
|
||||||
template<>
|
template <>
|
||||||
inline __device__ uint32_t mul(uint32_t a, uint32_t b) {
|
inline __device__ uint32_t mul(uint32_t a, uint32_t b) {
|
||||||
uint32_t c;
|
uint32_t c;
|
||||||
|
#ifndef USE_ROCM
|
||||||
asm volatile("mul.f16x2 %0, %1, %2;\n" : "=r"(c) : "r"(a), "r"(b));
|
asm volatile("mul.f16x2 %0, %1, %2;\n" : "=r"(c) : "r"(a), "r"(b));
|
||||||
|
#else
|
||||||
|
asm volatile("v_pk_mul_f16 %0, %1, %2;\n" : "=v"(c) : "v"(a), "v"(b));
|
||||||
|
#endif
|
||||||
return c;
|
return c;
|
||||||
}
|
}
|
||||||
|
|
||||||
template<>
|
template <>
|
||||||
inline __device__ uint32_t mul(uint16_t a, uint32_t b) {
|
inline __device__ uint32_t mul(uint16_t a, uint32_t b) {
|
||||||
return mul<uint32_t, uint32_t, uint32_t>(h0_h0(a), b);
|
return mul<uint32_t, uint32_t, uint32_t>(h0_h0(a), b);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<>
|
template <>
|
||||||
inline __device__ uint2 mul(uint2 a, uint2 b) {
|
inline __device__ uint2 mul(uint2 a, uint2 b) {
|
||||||
uint2 c;
|
uint2 c;
|
||||||
c.x = mul<uint32_t, uint32_t, uint32_t>(a.x, b.x);
|
c.x = mul<uint32_t, uint32_t, uint32_t>(a.x, b.x);
|
||||||
@ -182,7 +240,7 @@ inline __device__ uint2 mul(uint2 a, uint2 b) {
|
|||||||
return c;
|
return c;
|
||||||
}
|
}
|
||||||
|
|
||||||
template<>
|
template <>
|
||||||
inline __device__ uint2 mul(uint16_t a, uint2 b) {
|
inline __device__ uint2 mul(uint16_t a, uint2 b) {
|
||||||
uint32_t s = h0_h0(a);
|
uint32_t s = h0_h0(a);
|
||||||
uint2 c;
|
uint2 c;
|
||||||
@ -191,7 +249,7 @@ inline __device__ uint2 mul(uint16_t a, uint2 b) {
|
|||||||
return c;
|
return c;
|
||||||
}
|
}
|
||||||
|
|
||||||
template<>
|
template <>
|
||||||
inline __device__ uint4 mul(uint4 a, uint4 b) {
|
inline __device__ uint4 mul(uint4 a, uint4 b) {
|
||||||
uint4 c;
|
uint4 c;
|
||||||
c.x = mul<uint32_t, uint32_t, uint32_t>(a.x, b.x);
|
c.x = mul<uint32_t, uint32_t, uint32_t>(a.x, b.x);
|
||||||
@ -201,7 +259,7 @@ inline __device__ uint4 mul(uint4 a, uint4 b) {
|
|||||||
return c;
|
return c;
|
||||||
}
|
}
|
||||||
|
|
||||||
template<>
|
template <>
|
||||||
inline __device__ uint4 mul(uint16_t a, uint4 b) {
|
inline __device__ uint4 mul(uint16_t a, uint4 b) {
|
||||||
uint32_t s = h0_h0(a);
|
uint32_t s = h0_h0(a);
|
||||||
uint4 c;
|
uint4 c;
|
||||||
@ -212,26 +270,26 @@ inline __device__ uint4 mul(uint16_t a, uint4 b) {
|
|||||||
return c;
|
return c;
|
||||||
}
|
}
|
||||||
|
|
||||||
template<>
|
template <>
|
||||||
inline __device__ float mul(uint16_t a, uint16_t b) {
|
inline __device__ float mul(uint16_t a, uint16_t b) {
|
||||||
float fa = half_to_float(a);
|
float fa = half_to_float(a);
|
||||||
float fb = half_to_float(b);
|
float fb = half_to_float(b);
|
||||||
return fa * fb;
|
return fa * fb;
|
||||||
}
|
}
|
||||||
|
|
||||||
template<>
|
template <>
|
||||||
inline __device__ float2 mul(uint32_t a, uint32_t b) {
|
inline __device__ float2 mul(uint32_t a, uint32_t b) {
|
||||||
float2 fa = half2_to_float2(a);
|
float2 fa = half2_to_float2(a);
|
||||||
float2 fb = half2_to_float2(b);
|
float2 fb = half2_to_float2(b);
|
||||||
return mul<float2, float2, float2>(fa, fb);
|
return mul<float2, float2, float2>(fa, fb);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<>
|
template <>
|
||||||
inline __device__ float2 mul(uint16_t a, uint32_t b) {
|
inline __device__ float2 mul(uint16_t a, uint32_t b) {
|
||||||
return mul<float2, uint32_t, uint32_t>(h0_h0(a), b);
|
return mul<float2, uint32_t, uint32_t>(h0_h0(a), b);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<>
|
template <>
|
||||||
inline __device__ Float4_ mul(uint2 a, uint2 b) {
|
inline __device__ Float4_ mul(uint2 a, uint2 b) {
|
||||||
Float4_ fc;
|
Float4_ fc;
|
||||||
fc.x = mul<float2, uint32_t, uint32_t>(a.x, b.x);
|
fc.x = mul<float2, uint32_t, uint32_t>(a.x, b.x);
|
||||||
@ -239,7 +297,7 @@ inline __device__ Float4_ mul(uint2 a, uint2 b) {
|
|||||||
return fc;
|
return fc;
|
||||||
}
|
}
|
||||||
|
|
||||||
template<>
|
template <>
|
||||||
inline __device__ Float4_ mul(uint16_t a, uint2 b) {
|
inline __device__ Float4_ mul(uint16_t a, uint2 b) {
|
||||||
uint32_t s = h0_h0(a);
|
uint32_t s = h0_h0(a);
|
||||||
Float4_ fc;
|
Float4_ fc;
|
||||||
@ -248,7 +306,7 @@ inline __device__ Float4_ mul(uint16_t a, uint2 b) {
|
|||||||
return fc;
|
return fc;
|
||||||
}
|
}
|
||||||
|
|
||||||
template<>
|
template <>
|
||||||
inline __device__ Float8_ mul(uint4 a, uint4 b) {
|
inline __device__ Float8_ mul(uint4 a, uint4 b) {
|
||||||
Float8_ fc;
|
Float8_ fc;
|
||||||
fc.x = mul<float2, uint32_t, uint32_t>(a.x, b.x);
|
fc.x = mul<float2, uint32_t, uint32_t>(a.x, b.x);
|
||||||
@ -258,7 +316,7 @@ inline __device__ Float8_ mul(uint4 a, uint4 b) {
|
|||||||
return fc;
|
return fc;
|
||||||
}
|
}
|
||||||
|
|
||||||
template<>
|
template <>
|
||||||
inline __device__ Float8_ mul(uint16_t a, uint4 b) {
|
inline __device__ Float8_ mul(uint16_t a, uint4 b) {
|
||||||
uint32_t s = h0_h0(a);
|
uint32_t s = h0_h0(a);
|
||||||
Float8_ fc;
|
Float8_ fc;
|
||||||
@ -272,7 +330,15 @@ inline __device__ Float8_ mul(uint16_t a, uint4 b) {
|
|||||||
// Vector fused multiply-add.
|
// Vector fused multiply-add.
|
||||||
inline __device__ uint32_t fma(uint32_t a, uint32_t b, uint32_t c) {
|
inline __device__ uint32_t fma(uint32_t a, uint32_t b, uint32_t c) {
|
||||||
uint32_t d;
|
uint32_t d;
|
||||||
asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(d) : "r"(a), "r"(b), "r"(c));
|
#ifndef USE_ROCM
|
||||||
|
asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n"
|
||||||
|
: "=r"(d)
|
||||||
|
: "r"(a), "r"(b), "r"(c));
|
||||||
|
#else
|
||||||
|
asm volatile("v_pk_fma_f16 %0, %1, %2, %3;\n"
|
||||||
|
: "=v"(d)
|
||||||
|
: "v"(a), "v"(b), "v"(c));
|
||||||
|
#endif
|
||||||
return d;
|
return d;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -365,24 +431,24 @@ inline __device__ Float8_ fma(uint16_t a, uint4 b, Float8_ fc) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Vector sum.
|
// Vector sum.
|
||||||
template<>
|
template <>
|
||||||
inline __device__ float sum(uint16_t v) {
|
inline __device__ float sum(uint16_t v) {
|
||||||
return half_to_float(v);
|
return half_to_float(v);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<>
|
template <>
|
||||||
inline __device__ float sum(uint32_t v) {
|
inline __device__ float sum(uint32_t v) {
|
||||||
float2 tmp = half2_to_float2(v);
|
float2 tmp = half2_to_float2(v);
|
||||||
return tmp.x + tmp.y;
|
return tmp.x + tmp.y;
|
||||||
}
|
}
|
||||||
|
|
||||||
template<>
|
template <>
|
||||||
inline __device__ float sum(uint2 v) {
|
inline __device__ float sum(uint2 v) {
|
||||||
uint32_t c = add(v.x, v.y);
|
uint32_t c = add(v.x, v.y);
|
||||||
return sum(c);
|
return sum(c);
|
||||||
}
|
}
|
||||||
|
|
||||||
template<>
|
template <>
|
||||||
inline __device__ float sum(uint4 v) {
|
inline __device__ float sum(uint4 v) {
|
||||||
uint32_t c = add(v.x, v.y);
|
uint32_t c = add(v.x, v.y);
|
||||||
c = add(c, v.z);
|
c = add(c, v.z);
|
||||||
@ -412,13 +478,9 @@ inline __device__ void from_float(uint4& dst, Float8_ src) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// From float16 to float32.
|
// From float16 to float32.
|
||||||
inline __device__ float to_float(uint16_t u) {
|
inline __device__ float to_float(uint16_t u) { return half_to_float(u); }
|
||||||
return half_to_float(u);
|
|
||||||
}
|
|
||||||
|
|
||||||
inline __device__ float2 to_float(uint32_t u) {
|
inline __device__ float2 to_float(uint32_t u) { return half2_to_float2(u); }
|
||||||
return half2_to_float2(u);
|
|
||||||
}
|
|
||||||
|
|
||||||
inline __device__ Float4_ to_float(uint2 u) {
|
inline __device__ Float4_ to_float(uint2 u) {
|
||||||
Float4_ tmp;
|
Float4_ tmp;
|
||||||
@ -437,8 +499,6 @@ inline __device__ Float8_ to_float(uint4 u) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Zero-out a variable.
|
// Zero-out a variable.
|
||||||
inline __device__ void zero(uint16_t& dst) {
|
inline __device__ void zero(uint16_t& dst) { dst = uint16_t(0); }
|
||||||
dst = uint16_t(0);
|
|
||||||
}
|
|
||||||
|
|
||||||
} // namespace vllm
|
} // namespace vllm
|
||||||
|
|||||||
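The FP16 hunks above add union-based fallbacks for platforms without the PTX conversion instructions. A minimal host-side sketch of the packing convention they rely on: a 16-bit half bit pattern is replicated into both 16-bit lanes of a 32-bit word. h0_h0_ref is an illustrative name, not part of the diff.

#include <cstdint>
#include <cstdio>

uint32_t h0_h0_ref(uint16_t a) {
  union {
    uint32_t u32;
    uint16_t u16[2];
  } tmp;
  tmp.u16[0] = a;  // low lane
  tmp.u16[1] = a;  // high lane
  return tmp.u32;
}

int main() {
  uint16_t one_fp16 = 0x3C00;  // 1.0 in the IEEE fp16 bit pattern
  std::printf("0x%08X\n", static_cast<unsigned>(h0_h0_ref(one_fp16)));  // 0x3C003C00
}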
@@ -1,6 +1,8 @@
 /*
- * Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp
- * and https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h
+ * Adapted from
+ * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp
+ * and
+ * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h
 * Copyright (c) 2023, The vLLM team.
 * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
 *
@@ -38,37 +40,35 @@ struct Float8_ {
 };
 
 // FP32 vector types for Q, K, V.
-template<>
+template <>
 struct Vec<float, 1> {
   using Type = float;
 };
-template<>
+template <>
 struct Vec<float, 2> {
   using Type = float2;
 };
-template<>
+template <>
 struct Vec<float, 4> {
   using Type = float4;
 };
 
 // FP32 accumulator vector types corresponding to Vec.
-template<>
+template <>
 struct FloatVec<float> {
   using Type = float;
 };
-template<>
+template <>
 struct FloatVec<float2> {
   using Type = float2;
 };
-template<>
+template <>
 struct FloatVec<float4> {
   using Type = float4;
 };
 
 // Vector addition.
-inline __device__ float add(float a, float b) {
-  return a + b;
-}
+inline __device__ float add(float a, float b) { return a + b; }
 
 inline __device__ float2 add(float2 a, float2 b) {
   float2 c;
@@ -87,12 +87,12 @@ inline __device__ float4 add(float4 a, float4 b) {
 }
 
 // Vector multiplication.
-template<>
+template <>
 inline __device__ float mul<float, float>(float a, float b) {
   return a * b;
 }
 
-template<>
+template <>
 inline __device__ float2 mul(float2 a, float2 b) {
   float2 c;
   c.x = a.x * b.x;
@@ -100,7 +100,7 @@ inline __device__ float2 mul(float2 a, float2 b) {
   return c;
 }
 
-template<>
+template <>
 inline __device__ float2 mul(float a, float2 b) {
   float2 c;
   c.x = a * b.x;
@@ -108,7 +108,7 @@ inline __device__ float2 mul(float a, float2 b) {
   return c;
 }
 
-template<>
+template <>
 inline __device__ float4 mul(float4 a, float4 b) {
   float4 c;
   c.x = a.x * b.x;
@@ -118,7 +118,7 @@ inline __device__ float4 mul(float4 a, float4 b) {
   return c;
 }
 
-template<>
+template <>
 inline __device__ float4 mul(float a, float4 b) {
   float4 c;
   c.x = a * b.x;
@@ -129,9 +129,7 @@ inline __device__ float4 mul(float a, float4 b) {
 }
 
 // Vector fused multiply-add.
-inline __device__ float fma(float a, float b, float c) {
-  return a * b + c;
-}
+inline __device__ float fma(float a, float b, float c) { return a * b + c; }
 
 inline __device__ float2 fma(float2 a, float2 b, float2 c) {
   float2 d;
@@ -182,35 +180,33 @@ inline __device__ Float8_ fma(float a, Float8_ b, Float8_ c) {
 }
 
 // Vector sum.
-template<>
+template <>
 inline __device__ float sum(float v) {
   return v;
 }
 
-template<>
+template <>
 inline __device__ float sum(float2 v) {
   return v.x + v.y;
 }
 
-template<>
+template <>
 inline __device__ float sum(float4 v) {
   return v.x + v.y + v.z + v.w;
 }
 
-template<>
+template <>
 inline __device__ float sum(Float4_ v) {
   return v.x.x + v.x.y + v.y.x + v.y.y;
 }
 
-template<>
+template <>
 inline __device__ float sum(Float8_ v) {
   return v.x.x + v.x.y + v.y.x + v.y.y + v.z.x + v.z.y + v.w.x + v.w.y;
 }
 
 // Vector dot product.
-inline __device__ float dot(float a, float b) {
-  return a * b;
-}
+inline __device__ float dot(float a, float b) { return a * b; }
 
 inline __device__ float dot(float2 a, float2 b) {
   float2 c = mul<float2, float2, float2>(a, b);
@@ -232,42 +228,24 @@ inline __device__ float dot(Float8_ a, Float8_ b) {
 }
 
 // From float to float.
-inline __device__ void from_float(float& dst, float src) {
-  dst = src;
-}
+inline __device__ void from_float(float& dst, float src) { dst = src; }
 
-inline __device__ void from_float(float2& dst, float2 src) {
-  dst = src;
-}
+inline __device__ void from_float(float2& dst, float2 src) { dst = src; }
 
-inline __device__ void from_float(float4& dst, float4 src) {
-  dst = src;
-}
+inline __device__ void from_float(float4& dst, float4 src) { dst = src; }
 
 // From float to float.
-inline __device__ float to_float(float u) {
-  return u;
-}
+inline __device__ float to_float(float u) { return u; }
 
-inline __device__ float2 to_float(float2 u) {
-  return u;
-}
+inline __device__ float2 to_float(float2 u) { return u; }
 
-inline __device__ float4 to_float(float4 u) {
-  return u;
-}
+inline __device__ float4 to_float(float4 u) { return u; }
 
-inline __device__ Float4_ to_float(Float4_ u) {
-  return u;
-}
+inline __device__ Float4_ to_float(Float4_ u) { return u; }
 
-inline __device__ Float8_ to_float(Float8_ u) {
-  return u;
-}
+inline __device__ Float8_ to_float(Float8_ u) { return u; }
 
 // Zero-out a variable.
-inline __device__ void zero(float& dst) {
-  dst = 0.f;
-}
+inline __device__ void zero(float& dst) { dst = 0.f; }
 
 } // namespace vllm
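The FP32 specializations above follow one contract: mul<> multiplies elementwise into the FloatVec accumulator type and sum<> reduces it, so dot(a, b) == sum(mul(a, b)). A minimal host-side sketch of that contract with float2-like values; mul_ref, sum_ref and dot_ref are illustrative names, not part of the header.

#include <cstdio>

struct F2 { float x, y; };  // stands in for float2

F2 mul_ref(F2 a, F2 b) { return {a.x * b.x, a.y * b.y}; }  // elementwise product
float sum_ref(F2 v) { return v.x + v.y; }                  // lane reduction
float dot_ref(F2 a, F2 b) { return sum_ref(mul_ref(a, b)); }

int main() {
  F2 a{1.f, 2.f}, b{3.f, 4.f};
  std::printf("dot = %g\n", dot_ref(a, b));  // 1*3 + 2*4 = 11
}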
41  csrc/attention/dtype_fp8.cuh  Normal file
@@ -0,0 +1,41 @@
+#pragma once
+
+#include "attention_generic.cuh"
+
+#include <stdint.h>
+#ifdef ENABLE_FP8
+#ifndef USE_ROCM
+#include <cuda_fp8.h>
+#endif  // USE_ROCM
+#endif  // ENABLE_FP8
+
+namespace vllm {
+
+enum class Fp8KVCacheDataType {
+  kAuto = 0,
+  kFp8E4M3 = 1,
+  kFp8E5M2 = 2,
+};
+
+// fp8 vector types for quantization of kv cache
+template <>
+struct Vec<uint8_t, 1> {
+  using Type = uint8_t;
+};
+
+template <>
+struct Vec<uint8_t, 2> {
+  using Type = uint16_t;
+};
+
+template <>
+struct Vec<uint8_t, 4> {
+  using Type = uint32_t;
+};
+
+template <>
+struct Vec<uint8_t, 8> {
+  using Type = uint2;
+};
+
+} // namespace vllm
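A minimal sketch of how the Vec<uint8_t, N> specializations above are meant to be consumed: the trait maps "N quantized fp8 bytes" to a single packed register type at compile time so loads and stores stay vectorized. The stand-in trait and the load_fp8_vec helper below are illustrative, not definitions from this header.

#include <cstdint>
#include <cstring>

// Stand-in for the vllm::Vec specializations from dtype_fp8.cuh.
template <typename T, int N> struct Vec;
template <> struct Vec<uint8_t, 4> { using Type = uint32_t; };

template <int N>
typename Vec<uint8_t, N>::Type load_fp8_vec(const uint8_t* ptr) {
  typename Vec<uint8_t, N>::Type packed;
  std::memcpy(&packed, ptr, sizeof(packed));  // one packed load of N fp8 bytes
  return packed;
}

int main() {
  uint8_t quantized[4] = {1, 2, 3, 4};
  uint32_t packed = load_fp8_vec<4>(quantized);
  (void)packed;  // would be fed to a scaled_convert-style dequantizer
}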
@@ -1,47 +0,0 @@
-#include <torch/extension.h>
-
-#include <map>
-#include <vector>
-
-void swap_blocks(
-  torch::Tensor& src,
-  torch::Tensor& dst,
-  const std::map<int64_t, int64_t>& block_mapping);
-
-void copy_blocks(
-  std::vector<torch::Tensor>& key_caches,
-  std::vector<torch::Tensor>& value_caches,
-  const std::map<int64_t, std::vector<int64_t>>& block_mapping);
-
-void reshape_and_cache(
-  torch::Tensor& key,
-  torch::Tensor& value,
-  torch::Tensor& key_cache,
-  torch::Tensor& value_cache,
-  torch::Tensor& slot_mapping);
-
-void gather_cached_kv(
-  torch::Tensor& key,
-  torch::Tensor& value,
-  torch::Tensor& key_cache,
-  torch::Tensor& value_cache,
-  torch::Tensor& slot_mapping);
-
-PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
-  m.def(
-    "swap_blocks",
-    &swap_blocks,
-    "Swap in (out) the cache blocks from src to dst");
-  m.def(
-    "copy_blocks",
-    &copy_blocks,
-    "Copy the cache blocks from src to dst");
-  m.def(
-    "reshape_and_cache",
-    &reshape_and_cache,
-    "Reshape the key and value tensors and cache them");
-  m.def(
-    "gather_cached_kv",
-    &gather_cached_kv,
-    "Gather key and value from the cache into contiguous QKV tensors");
-}
32  csrc/cache.h  Normal file
@@ -0,0 +1,32 @@
+#pragma once
+
+#include <torch/all.h>
+
+#include <map>
+#include <vector>
+
+void swap_blocks(torch::Tensor& src, torch::Tensor& dst,
+                 const torch::Tensor& block_mapping);
+
+// Note: the key_caches and value_caches vectors are constant but
+// not the Tensors they contain. The vectors need to be const refs
+// in order to satisfy pytorch's C++ operator registration code.
+void copy_blocks(std::vector<torch::Tensor> const& key_caches,
+                 std::vector<torch::Tensor> const& value_caches,
+                 const torch::Tensor& block_mapping);
+
+void reshape_and_cache(torch::Tensor& key, torch::Tensor& value,
+                       torch::Tensor& key_cache, torch::Tensor& value_cache,
+                       torch::Tensor& slot_mapping,
+                       const std::string& kv_cache_dtype,
+                       const double kv_scale);
+
+void reshape_and_cache_flash(torch::Tensor& key, torch::Tensor& value,
+                             torch::Tensor& key_cache,
+                             torch::Tensor& value_cache,
+                             torch::Tensor& slot_mapping,
+                             const std::string& kv_cache_dtype);
+
+// Just for unittest
+void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache,
+                 const double scale, const std::string& kv_cache_dtype);
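A hedged usage sketch for the declarations above, assuming the code is compiled and linked as part of the same vLLM extension that defines them and that "cache.h" is on the include path. swap_blocks now takes a (num_pairs, 2) int64 tensor instead of a std::map, and that tensor is expected to live on the CPU (see the check in the implementation further below).

#include <torch/all.h>
#include "cache.h"  // assumed include path within csrc/

void demo_swap(torch::Tensor& src_cache, torch::Tensor& dst_cache) {
  // Move cache block 0 -> 3 and 1 -> 7; one row per (src, dst) pair.
  torch::Tensor block_mapping =
      torch::tensor({{0, 3}, {1, 7}},
                    torch::dtype(torch::kInt64).device(torch::kCPU));
  swap_blocks(src_cache, dst_cache, block_mapping);
}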
@@ -1,24 +1,34 @@
-#include <torch/extension.h>
+#include <torch/all.h>
 #include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+
+#include "cuda_compat.h"
 #include "dispatch_utils.h"
+
+#ifdef USE_ROCM
+#include "quantization/fp8/amd/quant_utils.cuh"
+#else
+#include "quantization/fp8/nvidia/quant_utils.cuh"
+#endif
+
 #include <algorithm>
 #include <cassert>
 #include <map>
 #include <vector>
 
-void swap_blocks(
-    torch::Tensor& src,
-    torch::Tensor& dst,
-    const std::map<int64_t, int64_t>& block_mapping) {
+#ifdef USE_ROCM
+#include <hip/hip_bf16.h>
+typedef __hip_bfloat16 __nv_bfloat16;
+#endif
+
+void swap_blocks(torch::Tensor& src, torch::Tensor& dst,
+                 const torch::Tensor& block_mapping) {
   torch::Device src_device = src.device();
   torch::Device dst_device = dst.device();
   cudaMemcpyKind memcpy_type;
   if (src_device.is_cuda() && dst_device.is_cuda()) {
-    TORCH_CHECK(
-      src_device.index() == dst_device.index(),
-      "src and dst must be on the same GPU");
+    TORCH_CHECK(src_device.index() == dst_device.index(),
+                "src and dst must be on the same GPU");
     memcpy_type = cudaMemcpyDeviceToDevice;
   } else if (src_device.is_cuda() && dst_device.is_cpu()) {
     memcpy_type = cudaMemcpyDeviceToHost;
@@ -28,40 +38,44 @@ void swap_blocks(
     TORCH_CHECK(false, "Invalid device combination");
   }
 
-  void *src_ptr = src.data_ptr();
-  void *dst_ptr = dst.data_ptr();
+  // NOTE(youkaichao): keep in mind that `block_mapping` should be
+  // a cpu tensor, otherwise every `item` call will require a gpu-cpu
+  // synchronization.
+  TORCH_CHECK(block_mapping.device().is_cpu(), "block_mapping must be on CPU");
+
+  char* src_ptr = static_cast<char*>(src.data_ptr());
+  char* dst_ptr = static_cast<char*>(dst.data_ptr());
 
   const int64_t block_size_in_bytes = src.element_size() * src[0].numel();
+  const at::cuda::OptionalCUDAGuard device_guard(
+      src_device.is_cuda() ? src_device : dst_device);
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
   // NOTE(woosuk): This can be slow if the number of blocks is large.
-  for (const auto& pair : block_mapping) {
-    int64_t src_block_number = pair.first;
-    int64_t dst_block_number = pair.second;
+  const int64_t num_blocks = block_mapping.size(0);
+  for (size_t i = 0; i < num_blocks; i++) {
+    int64_t src_block_number = block_mapping[i][0].item<int64_t>();
+    int64_t dst_block_number = block_mapping[i][1].item<int64_t>();
     int64_t src_offset = src_block_number * block_size_in_bytes;
     int64_t dst_offset = dst_block_number * block_size_in_bytes;
-    cudaMemcpyAsync(
-      dst_ptr + dst_offset,
-      src_ptr + src_offset,
-      block_size_in_bytes,
-      memcpy_type,
-      stream);
+    cudaMemcpyAsync(dst_ptr + dst_offset, src_ptr + src_offset,
+                    block_size_in_bytes, memcpy_type, stream);
   }
 }
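Worked example of the offset arithmetic in swap_blocks above: each cache block occupies element_size * src[0].numel() bytes, so block i starts at i * block_size_in_bytes. The concrete sizes below are illustrative, not taken from the diff.

#include <cstdint>
#include <cstdio>

int main() {
  const int64_t element_size = 2;                 // e.g. an fp16 KV cache
  const int64_t numel_per_block = 16 * 8 * 128;   // example block_size * num_heads * head_size
  const int64_t block_size_in_bytes = element_size * numel_per_block;
  const int64_t src_block_number = 5;
  std::printf("src offset = %lld bytes\n",
              static_cast<long long>(src_block_number * block_size_in_bytes));
}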
 namespace vllm {
 
 // Grid: (num_layers, num_pairs)
-template<typename scalar_t>
-__global__ void copy_blocks_kernel(
-    int64_t* key_cache_ptrs,
-    int64_t* value_cache_ptrs,
-    const int64_t* __restrict__ block_mapping,
-    const int numel_per_block) {
+template <typename scalar_t>
+__global__ void copy_blocks_kernel(int64_t* key_cache_ptrs,
+                                   int64_t* value_cache_ptrs,
+                                   const int64_t* __restrict__ block_mapping,
+                                   const int numel_per_block) {
   const int layer_idx = blockIdx.x;
   const int pair_idx = blockIdx.y;
 
   scalar_t* key_cache = reinterpret_cast<scalar_t*>(key_cache_ptrs[layer_idx]);
-  scalar_t* value_cache = reinterpret_cast<scalar_t*>(value_cache_ptrs[layer_idx]);
+  scalar_t* value_cache =
+      reinterpret_cast<scalar_t*>(value_cache_ptrs[layer_idx]);
   int64_t src_block_number = block_mapping[2 * pair_idx];
   int64_t dst_block_number = block_mapping[2 * pair_idx + 1];
 
@@ -79,12 +93,14 @@ __global__ void copy_blocks_kernel(
   }
 }
 
 } // namespace vllm
 
-void copy_blocks(
-    std::vector<torch::Tensor>& key_caches,
-    std::vector<torch::Tensor>& value_caches,
-    const std::map<int64_t, std::vector<int64_t>>& block_mapping) {
+// Note: the key_caches and value_caches vectors are constant but
+// not the Tensors they contain. The vectors need to be const refs
+// in order to satisfy pytorch's C++ operator registration code.
+void copy_blocks(std::vector<torch::Tensor> const& key_caches,
+                 std::vector<torch::Tensor> const& value_caches,
+                 const torch::Tensor& block_mapping) {
   int num_layers = key_caches.size();
   TORCH_CHECK(num_layers == value_caches.size());
   if (num_layers == 0) {
@@ -98,60 +114,53 @@ void copy_blocks(
   int64_t key_cache_ptrs[num_layers];
   int64_t value_cache_ptrs[num_layers];
   for (int layer_idx = 0; layer_idx < num_layers; ++layer_idx) {
-    key_cache_ptrs[layer_idx] = reinterpret_cast<int64_t>(key_caches[layer_idx].data_ptr());
-    value_cache_ptrs[layer_idx] = reinterpret_cast<int64_t>(value_caches[layer_idx].data_ptr());
+    key_cache_ptrs[layer_idx] =
+        reinterpret_cast<int64_t>(key_caches[layer_idx].data_ptr());
+    value_cache_ptrs[layer_idx] =
+        reinterpret_cast<int64_t>(value_caches[layer_idx].data_ptr());
   }
-  // Create block mapping array.
-  std::vector<int64_t> block_mapping_vec;
-  for (const auto& pair : block_mapping) {
-    int64_t src_block_number = pair.first;
-    for (int64_t dst_block_number : pair.second) {
-      block_mapping_vec.push_back(src_block_number);
-      block_mapping_vec.push_back(dst_block_number);
-    }
-  }
-  int64_t* block_mapping_array = block_mapping_vec.data();
-  int num_pairs = block_mapping_vec.size() / 2;
+
+  // block_mapping is a 2D tensor with shape (num_pairs, 2).
+  int num_pairs = block_mapping.size(0);
 
   // Move the data structures to the GPU.
   // NOTE: This synchronizes the CPU and GPU.
-  torch::Tensor key_cache_ptrs_tensor = torch::from_blob(
-    key_cache_ptrs, {num_layers}, torch::kInt64).to(cache_device);
-  torch::Tensor value_cache_ptrs_tensor = torch::from_blob(
-    value_cache_ptrs, {num_layers}, torch::kInt64).to(cache_device);
-  torch::Tensor block_mapping_tensor = torch::from_blob(
-    block_mapping_array, {2 * num_pairs}, torch::kInt64).to(cache_device);
+  torch::Tensor key_cache_ptrs_tensor =
+      torch::from_blob(key_cache_ptrs, {num_layers}, torch::kInt64)
+          .to(cache_device);
+  torch::Tensor value_cache_ptrs_tensor =
+      torch::from_blob(value_cache_ptrs, {num_layers}, torch::kInt64)
+          .to(cache_device);
 
   // Launch the kernel.
   const int numel_per_block = key_caches[0][0].numel();
   dim3 grid(num_layers, num_pairs);
   dim3 block(std::min(1024, numel_per_block));
+  const at::cuda::OptionalCUDAGuard device_guard(cache_device);
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-  VLLM_DISPATCH_FLOATING_TYPES(
+  VLLM_DISPATCH_FLOATING_AND_BYTE_TYPES(
     key_caches[0].scalar_type(), "copy_blocks_kernel", ([&] {
       vllm::copy_blocks_kernel<scalar_t><<<grid, block, 0, stream>>>(
         key_cache_ptrs_tensor.data_ptr<int64_t>(),
         value_cache_ptrs_tensor.data_ptr<int64_t>(),
-        block_mapping_tensor.data_ptr<int64_t>(),
-        numel_per_block);
-  }));
+        block_mapping.data_ptr<int64_t>(), numel_per_block);
+      }));
 }
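Sketch of the block_mapping layout the copy_blocks path above relies on: a contiguous (num_pairs, 2) int64 tensor is consumed by the kernel as a flat array, with pair i occupying indices 2*i (source block) and 2*i + 1 (destination block). Plain C++ illustration of that indexing.

#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  // Two copy pairs, 0 -> 3 and 1 -> 7, stored row-major as the kernel reads them.
  std::vector<int64_t> block_mapping = {0, 3, 1, 7};
  const int num_pairs = static_cast<int>(block_mapping.size() / 2);
  for (int pair_idx = 0; pair_idx < num_pairs; ++pair_idx) {
    int64_t src = block_mapping[2 * pair_idx];
    int64_t dst = block_mapping[2 * pair_idx + 1];
    std::printf("copy block %lld -> %lld\n",
                static_cast<long long>(src), static_cast<long long>(dst));
  }
}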
 namespace vllm {
 
-template<typename scalar_t>
+template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt>
 __global__ void reshape_and_cache_kernel(
     const scalar_t* __restrict__ key,    // [num_tokens, num_heads, head_size]
     const scalar_t* __restrict__ value,  // [num_tokens, num_heads, head_size]
-    scalar_t* __restrict__ key_cache,    // [num_blocks, num_heads, head_size/x, block_size, x]
-    scalar_t* __restrict__ value_cache,  // [num_blocks, num_heads, head_size, block_size]
-    const int64_t* __restrict__ slot_mapping,  // [num_tokens]
-    const int key_stride,
-    const int value_stride,
-    const int num_heads,
-    const int head_size,
-    const int block_size,
-    const int x) {
+    cache_t* __restrict__ key_cache,    // [num_blocks, num_heads, head_size/x,
+                                        // block_size, x]
+    cache_t* __restrict__ value_cache,  // [num_blocks, num_heads, head_size,
+                                        // block_size]
+    const int64_t* __restrict__ slot_mapping,  // [num_tokens]
+    const int key_stride, const int value_stride, const int num_heads,
+    const int head_size, const int block_size, const int x,
+    const float kv_scale) {
   const int64_t token_idx = blockIdx.x;
   const int64_t slot_idx = slot_mapping[token_idx];
   if (slot_idx < 0) {
@@ -172,29 +181,84 @@ __global__ void reshape_and_cache_kernel(
     const int x_idx = head_offset / x;
     const int x_offset = head_offset % x;
 
-    const int64_t tgt_key_idx = block_idx * num_heads * (head_size / x) * block_size * x
-                                + head_idx * (head_size / x) * block_size * x
-                                + x_idx * block_size * x
-                                + block_offset * x
-                                + x_offset;
-    const int64_t tgt_value_idx = block_idx * num_heads * head_size * block_size
-                                  + head_idx * head_size * block_size
-                                  + head_offset * block_size
-                                  + block_offset;
-    key_cache[tgt_key_idx] = key[src_key_idx];
-    value_cache[tgt_value_idx] = value[src_value_idx];
+    const int64_t tgt_key_idx =
+        block_idx * num_heads * (head_size / x) * block_size * x +
+        head_idx * (head_size / x) * block_size * x + x_idx * block_size * x +
+        block_offset * x + x_offset;
+    const int64_t tgt_value_idx =
+        block_idx * num_heads * head_size * block_size +
+        head_idx * head_size * block_size + head_offset * block_size +
+        block_offset;
+    scalar_t tgt_key = key[src_key_idx];
+    scalar_t tgt_value = value[src_value_idx];
+    if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
+      key_cache[tgt_key_idx] = tgt_key;
+      value_cache[tgt_value_idx] = tgt_value;
+    } else {
+      key_cache[tgt_key_idx] =
+          fp8::scaled_convert<cache_t, scalar_t, kv_dt>(tgt_key, kv_scale);
+      value_cache[tgt_value_idx] =
+          fp8::scaled_convert<cache_t, scalar_t, kv_dt>(tgt_value, kv_scale);
+    }
   }
 }
 
-} // namespace vllm
+template <typename scalar_t>
+__global__ void reshape_and_cache_flash_kernel(
+    const scalar_t* __restrict__ key,    // [num_tokens, num_heads, head_size]
+    const scalar_t* __restrict__ value,  // [num_tokens, num_heads, head_size]
+    scalar_t* __restrict__ k_cache,  // [num_blocks, block_size, num_heads,
+                                     // head_size]
+    scalar_t* __restrict__ v_cache,  // [num_blocks, block_size, num_heads,
+                                     // head_size]
+    const int64_t* __restrict__ slot_mapping,  // [num_tokens]
+    const int block_stride, const int key_stride, const int value_stride,
+    const int num_heads, const int head_size, const int block_size) {
+  const int64_t token_idx = blockIdx.x;
+  const int64_t slot_idx = slot_mapping[token_idx];
+  // NOTE: slot_idx can be -1 if the token is padded
+  if (slot_idx < 0) {
+    return;
+  }
+  const int64_t block_idx = slot_idx / block_size;
+  const int64_t block_offset = slot_idx % block_size;
+  const int n = num_heads * head_size;
+  for (int i = threadIdx.x; i < n; i += blockDim.x) {
+    const int64_t src_key_idx = token_idx * key_stride + i;
+    const int64_t src_value_idx = token_idx * value_stride + i;
+    const int head_idx = i / head_size;
+    const int head_offset = i % head_size;
+    const int64_t tgt_value_idx = block_idx * block_stride +
+                                  block_offset * num_heads * head_size +
+                                  head_idx * head_size + head_offset;
+    k_cache[tgt_value_idx] = key[src_key_idx];
+    v_cache[tgt_value_idx] = value[src_value_idx];
+  }
+}
+} // namespace vllm
+
+// KV_T is the stored data type of kv-cache.
+// CACHE_T is the data type of key and value tensors.
+// KV_DTYPE is the real data type of kv-cache.
+#define CALL_RESHAPE_AND_CACHE(KV_T, CACHE_T, KV_DTYPE)               \
+  vllm::reshape_and_cache_kernel<KV_T, CACHE_T, KV_DTYPE>             \
+      <<<grid, block, 0, stream>>>(                                   \
+          reinterpret_cast<KV_T*>(key.data_ptr()),                    \
+          reinterpret_cast<KV_T*>(value.data_ptr()),                  \
+          reinterpret_cast<CACHE_T*>(key_cache.data_ptr()),           \
+          reinterpret_cast<CACHE_T*>(value_cache.data_ptr()),         \
+          slot_mapping.data_ptr<int64_t>(), key_stride, value_stride, \
+          num_heads, head_size, block_size, x, kv_scale);
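Worked example of the key-cache indexing used by reshape_and_cache_kernel above, for the [num_blocks, num_heads, head_size/x, block_size, x] layout. The concrete sizes are illustrative, not taken from the diff.

#include <cstdint>
#include <cstdio>

int main() {
  const int num_heads = 8, head_size = 128, block_size = 16, x = 8;
  const int64_t block_idx = 2, block_offset = 5;  // slot_idx = 2 * 16 + 5 = 37
  const int head_idx = 3, head_offset = 20;
  const int x_idx = head_offset / x;    // 2
  const int x_offset = head_offset % x; // 4
  const int64_t tgt_key_idx =
      block_idx * num_heads * (head_size / x) * block_size * x +
      head_idx * (head_size / x) * block_size * x + x_idx * block_size * x +
      block_offset * x + x_offset;
  // 32768 + 6144 + 256 + 40 + 4 = 39212 for these sizes.
  std::printf("tgt_key_idx = %lld\n", static_cast<long long>(tgt_key_idx));
}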
 void reshape_and_cache(
     torch::Tensor& key,    // [num_tokens, num_heads, head_size]
     torch::Tensor& value,  // [num_tokens, num_heads, head_size]
-    torch::Tensor& key_cache,    // [num_blocks, num_heads, head_size/x, block_size, x]
-    torch::Tensor& value_cache,  // [num_blocks, num_heads, head_size, block_size]
-    torch::Tensor& slot_mapping)  // [num_tokens]
-{
+    torch::Tensor&
+        key_cache,  // [num_blocks, num_heads, head_size/x, block_size, x]
+    torch::Tensor&
+        value_cache,  // [num_blocks, num_heads, head_size, block_size]
+    torch::Tensor& slot_mapping,  // [num_tokens]
+    const std::string& kv_cache_dtype, const double kv_scale) {
   int num_tokens = key.size(0);
   int num_heads = key.size(1);
   int head_size = key.size(2);
@@ -206,182 +270,120 @@ void reshape_and_cache(
 
   dim3 grid(num_tokens);
   dim3 block(std::min(num_heads * head_size, 512));
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(key));
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+
+  DISPATCH_BY_KV_CACHE_DTYPE(key.dtype(), kv_cache_dtype,
+                             CALL_RESHAPE_AND_CACHE)
+}
+
+void reshape_and_cache_flash(
+    torch::Tensor& key,      // [num_tokens, num_heads, head_size]
+    torch::Tensor& value,    // [num_tokens, num_heads, head_size]
+    torch::Tensor& k_cache,  // [num_blocks, block_size, num_heads, head_size]
+    torch::Tensor& v_cache,  // [num_blocks, block_size, num_heads, head_size]
+    torch::Tensor& slot_mapping,  // [num_tokens]
+    const std::string& kv_cache_dtype) {
+  // FIXME: only support auto datatype, does not support fp8
+  if (kv_cache_dtype != "auto") {
+    TORCH_CHECK(false, "Unsupported data type of kv cache: ", kv_cache_dtype);
+  }
+  int num_tokens = key.size(0);
+  int num_heads = key.size(1);
+  int head_size = key.size(2);
+  int block_size = k_cache.size(1);
+
+  int key_stride = key.stride(0);
+  int value_stride = value.stride(0);
+  int block_stride = k_cache.stride(0);
+  TORCH_CHECK(k_cache.stride(0) == v_cache.stride(0));
+
+  dim3 grid(num_tokens);
+  dim3 block(std::min(num_heads * head_size, 512));
+  const at::cuda::OptionalCUDAGuard device_guard(device_of(key));
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
   VLLM_DISPATCH_FLOATING_TYPES(
-    key.scalar_type(),
-    "reshape_and_cache_kernel",
-    [&] {
-      vllm::reshape_and_cache_kernel<scalar_t><<<grid, block, 0, stream>>>(
-        key.data_ptr<scalar_t>(),
-        value.data_ptr<scalar_t>(),
-        key_cache.data_ptr<scalar_t>(),
-        value_cache.data_ptr<scalar_t>(),
-        slot_mapping.data_ptr<int64_t>(),
-        key_stride,
-        value_stride,
-        num_heads,
-        head_size,
-        block_size,
-        x);
-    });
+      key.scalar_type(), "reshape_and_cache_flash", [&] {
+        vllm::reshape_and_cache_flash_kernel<scalar_t>
+            <<<grid, block, 0, stream>>>(
+                key.data_ptr<scalar_t>(), value.data_ptr<scalar_t>(),
+                k_cache.data_ptr<scalar_t>(), v_cache.data_ptr<scalar_t>(),
+                slot_mapping.data_ptr<int64_t>(), block_stride, key_stride,
+                value_stride, num_heads, head_size, block_size);
+      });
 }
namespace vllm {
|
namespace vllm {
|
||||||
|
|
||||||
// Grid: (num_blocks, block_size).
|
template <typename Tout, typename Tin, Fp8KVCacheDataType kv_dt>
|
||||||
template<typename scalar_t>
|
__global__ void convert_fp8_kernel(const Tin* __restrict__ src_cache,
|
||||||
__global__ void gather_cached_kv_kernel(
|
Tout* __restrict__ dst_cache,
|
||||||
scalar_t* __restrict__ key, // [num_tokens, [stride], num_heads, head_size]
|
const float kv_scale,
|
||||||
scalar_t* __restrict__ value, // [num_tokens, [stride], num_heads, head_size]
|
const int64_t block_stride) {
|
||||||
const scalar_t* __restrict__ key_cache, // [num_blocks, num_heads, head_size/x, block_size, x]
|
const int64_t block_idx = blockIdx.x;
|
||||||
const scalar_t* __restrict__ value_cache, // [num_blocks, num_heads, head_size, block_size]
|
for (int i = threadIdx.x; i < block_stride; i += blockDim.x) {
|
||||||
const int* __restrict__ slot_mapping, // [num_tokens]
|
int64_t idx = block_idx * block_stride + i;
|
||||||
const int key_stride,
|
dst_cache[idx] =
|
||||||
const int value_stride,
|
fp8::scaled_convert<Tout, Tin, kv_dt>(src_cache[idx], kv_scale);
|
||||||
const int num_heads,
|
}
|
||||||
const int head_size,
|
|
||||||
const int block_size,
|
|
||||||
const int x) {
|
|
||||||
const int token_idx = blockIdx.x;
|
|
||||||
const int slot_idx = slot_mapping[token_idx];
|
|
||||||
const int block_idx = slot_idx / block_size;
|
|
||||||
const int block_offset = slot_idx % block_size;
|
|
||||||
|
|
||||||
const int num_tokens = num_heads * head_size;
|
|
||||||
for (int i = threadIdx.x; i < num_tokens; i += blockDim.x) {
|
|
||||||
const int tgt_key_idx = token_idx * key_stride + i;
|
|
||||||
const int tgt_value_idx = token_idx * value_stride + i;
|
|
||||||
|
|
||||||
const int head_idx = i / head_size;
|
|
||||||
const int head_offset = i % head_size;
|
|
||||||
const int x_idx = head_offset / x; // the offset of the [head_size/x] dimension
|
|
||||||
const int x_offset = head_offset % x;
|
|
||||||
|
|
||||||
const int src_key_idx = block_idx * num_heads * (head_size / x) * block_size * x
|
|
||||||
+ head_idx * (head_size / x) * block_size * x
|
|
||||||
+ x_idx * block_size * x
|
|
||||||
+ block_offset * x
|
|
||||||
+ x_offset;
|
|
||||||
const int src_value_idx = block_idx * num_heads * head_size * block_size
|
|
||||||
+ head_idx * head_size * block_size
|
|
||||||
+ head_offset * block_size
|
|
||||||
+ block_offset;
|
|
||||||
|
|
||||||
key[tgt_key_idx] = __ldg(&key_cache[src_key_idx]);
|
|
||||||
value[tgt_value_idx] = __ldg(&value_cache[src_value_idx]);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename scalar_t>
|
} // namespace vllm
|
||||||
__global__ void gather_cached_kv_kernel_optimized(
|
|
||||||
scalar_t *__restrict__ key, // [num_tokens, [stride], num_heads, head_size]
|
|
||||||
scalar_t *__restrict__ value, // [num_tokens, [stride], num_heads, head_size]
|
|
||||||
const scalar_t *__restrict__ key_cache, // [num_blocks, num_heads, head_size/x, block_size, x]
|
|
||||||
const scalar_t *__restrict__ value_cache, // [num_blocks, num_heads, head_size, block_size]
|
|
||||||
const int *__restrict__ slot_mapping, // [num_tokens]
|
|
||||||
const int key_stride,
|
|
||||||
const int value_stride,
|
|
||||||
const int num_heads,
|
|
||||||
const int head_size,
|
|
||||||
const int block_size,
|
|
||||||
const int x)
|
|
||||||
{
|
|
||||||
const int token_idx = blockIdx.x;
|
|
||||||
const int slot_idx = slot_mapping[token_idx];
|
|
||||||
const int block_idx = slot_idx / block_size;
|
|
||||||
const int block_offset = slot_idx % block_size;
|
|
||||||
|
|
||||||
const int dim = num_heads * head_size;
|
#define CALL_CONVERT_FP8(Tout, Tin, KV_DTYPE) \
|
||||||
assert(dim % 4 == 0); // this is true for known use cases
|
vllm::convert_fp8_kernel<Tout, Tin, KV_DTYPE><<<grid, block, 0, stream>>>( \
|
||||||
const int unroll_factor = 4;
|
reinterpret_cast<Tin*>(src_cache.data_ptr()), \
|
||||||
const int unrolled_dim = dim / unroll_factor;
|
reinterpret_cast<Tout*>(dst_cache.data_ptr()), kv_scale, block_stride);
|
||||||
|
|
||||||
for (int i = threadIdx.x; i < unrolled_dim; i += blockDim.x)
|
// Only for testing.
|
||||||
{
|
void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache,
|
||||||
int tgt_key_indices[unroll_factor];
|
const double kv_scale, const std::string& kv_cache_dtype) {
|
||||||
int tgt_value_indices[unroll_factor];
|
torch::Device src_device = src_cache.device();
|
||||||
int src_key_indices[unroll_factor];
|
torch::Device dst_device = dst_cache.device();
|
||||||
int src_value_indices[unroll_factor];
|
TORCH_CHECK(src_device.is_cuda(), "src must be on a GPU")
|
||||||
scalar_t keys_to_store[unroll_factor];
|
TORCH_CHECK(dst_device.is_cuda(), "dst must be on a GPU")
|
||||||
scalar_t values_to_store[unroll_factor];
|
TORCH_CHECK(src_device.index() == dst_device.index(),
|
||||||
|
"src and dst must be on the same GPU");
|
||||||
|
at::cuda::OptionalCUDAGuard device_guard(src_device);
|
||||||
|
|
||||||
#pragma unroll
|
int64_t num_blocks = src_cache.size(0);
|
||||||
for (int j = 0; j < unroll_factor; ++j)
|
int64_t block_stride = src_cache.stride(0);
|
    {
      int index = i + j * unrolled_dim;
      const int tgt_key_idx = token_idx * key_stride + index;
      const int tgt_value_idx = token_idx * value_stride + index;

      const int head_idx = index / head_size;
      const int head_offset = index % head_size;
      const int x_idx = head_offset / x;
      const int x_offset = head_offset % x;

      const int src_key_idx = block_idx * num_heads * (head_size / x) * block_size * x
                              + head_idx * (head_size / x) * block_size * x
                              + x_idx * block_size * x
                              + block_offset * x
                              + x_offset;
      const int src_value_idx = block_idx * num_heads * head_size * block_size
                                + head_idx * head_size * block_size
                                + head_offset * block_size
                                + block_offset;

      tgt_key_indices[j] = tgt_key_idx;
      tgt_value_indices[j] = tgt_value_idx;
      src_key_indices[j] = src_key_idx;
      src_value_indices[j] = src_value_idx;

      keys_to_store[j] = __ldg(&key_cache[src_key_idx]);
      values_to_store[j] = __ldg(&value_cache[src_value_idx]);
    }

    #pragma unroll
    for (int j = 0; j < unroll_factor; ++j)
    {
      key[tgt_key_indices[j]] = keys_to_store[j];
      value[tgt_value_indices[j]] = values_to_store[j];
    }
  }
}

} // namespace vllm

void gather_cached_kv(
  torch::Tensor& key,           // [out] [num_tokens, num_heads, head_size]
  torch::Tensor& value,         // [out] [num_tokens, num_heads, head_size]
  torch::Tensor& key_cache,     // [in]  [num_blocks, num_heads, head_size/x, block_size, x]
  torch::Tensor& value_cache,   // [in]  [num_blocks, num_heads, head_size, block_size]
  torch::Tensor& slot_mapping)  // [in]  [num_tokens]
{
  int num_tokens = key.size(0);
  int num_heads = key.size(1);
  int head_size = key.size(2);
  int block_size = key_cache.size(3);
  int x = key_cache.size(4);

  int key_stride = key.stride(0);
  int value_stride = value.stride(0);

  dim3 grid(num_tokens);
  dim3 block(std::min(num_heads * head_size, 512));
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  VLLM_DISPATCH_FLOATING_TYPES(
    key.scalar_type(),
    "gather_cached_kv_kernel_optimized",
    [&] {
      vllm::gather_cached_kv_kernel_optimized<scalar_t><<<grid, block, 0, stream>>>(
        key.data_ptr<scalar_t>(),
        value.data_ptr<scalar_t>(),
        key_cache.data_ptr<scalar_t>(),
        value_cache.data_ptr<scalar_t>(),
        slot_mapping.data_ptr<int>(),
        key_stride,
        value_stride,
        num_heads,
        head_size,
        block_size,
        x);
    });
}

// The other column of this side-by-side hunk carries the CALL_CONVERT_FP8 dtype
// dispatch; it is reproduced below, untangled from the lines above:
  dim3 grid(num_blocks);
  dim3 block(std::min(block_stride, int64_t(512)));
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  if (kv_cache_dtype == "auto") {
    if (src_cache.dtype() == at::ScalarType::Float) {
      CALL_CONVERT_FP8(uint8_t, float, vllm::Fp8KVCacheDataType::kAuto);
    } else if (src_cache.dtype() == at::ScalarType::Half) {
      CALL_CONVERT_FP8(uint8_t, uint16_t, vllm::Fp8KVCacheDataType::kAuto);
    } else if (src_cache.dtype() == at::ScalarType::BFloat16) {
      CALL_CONVERT_FP8(uint8_t, __nv_bfloat16, vllm::Fp8KVCacheDataType::kAuto);
    } else if (dst_cache.dtype() == at::ScalarType::Float) {
      CALL_CONVERT_FP8(float, uint8_t, vllm::Fp8KVCacheDataType::kAuto);
    } else if (dst_cache.dtype() == at::ScalarType::Half) {
      CALL_CONVERT_FP8(uint16_t, uint8_t, vllm::Fp8KVCacheDataType::kAuto);
    } else if (dst_cache.dtype() == at::ScalarType::BFloat16) {
      CALL_CONVERT_FP8(__nv_bfloat16, uint8_t, vllm::Fp8KVCacheDataType::kAuto);
    }
  } else if (kv_cache_dtype == "fp8" || kv_cache_dtype == "fp8_e4m3") {
    if (src_cache.dtype() == at::ScalarType::Float) {
      CALL_CONVERT_FP8(uint8_t, float, vllm::Fp8KVCacheDataType::kFp8E4M3);
    } else if (src_cache.dtype() == at::ScalarType::Half) {
      CALL_CONVERT_FP8(uint8_t, uint16_t, vllm::Fp8KVCacheDataType::kFp8E4M3);
    } else if (src_cache.dtype() == at::ScalarType::BFloat16) {
      CALL_CONVERT_FP8(uint8_t, __nv_bfloat16,
                       vllm::Fp8KVCacheDataType::kFp8E4M3);
    } else if (dst_cache.dtype() == at::ScalarType::Float) {
      CALL_CONVERT_FP8(float, uint8_t, vllm::Fp8KVCacheDataType::kFp8E4M3);
    } else if (dst_cache.dtype() == at::ScalarType::Half) {
      CALL_CONVERT_FP8(uint16_t, uint8_t, vllm::Fp8KVCacheDataType::kFp8E4M3);
    } else if (dst_cache.dtype() == at::ScalarType::BFloat16) {
      CALL_CONVERT_FP8(__nv_bfloat16, uint8_t,
                       vllm::Fp8KVCacheDataType::kFp8E4M3);
    }
  } else {
    TORCH_CHECK(false, "Unsupported data type: ", kv_cache_dtype);
  }
}
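For readers skimming the diff, the macro chain above is just a string-keyed dispatch: the kv_cache_dtype string and the tensor dtypes together select one template instantiation of the conversion kernel. The sketch below restates that pattern in plain, CPU-only C++ so it can be compiled and run on its own; convert_cache, KVCacheDataType, and the element-wise cast body are illustrative stand-ins, not vLLM's actual launcher.

// Illustrative sketch, not vLLM source: string-keyed dtype dispatch to a template.
#include <cstdint>
#include <iostream>
#include <stdexcept>
#include <string>
#include <vector>

enum class KVCacheDataType { kAuto, kFp8E4M3 };  // stand-in for vllm::Fp8KVCacheDataType

template <typename Tout, typename Tin, KVCacheDataType kv_dt>
void convert_cache(std::vector<Tout>& dst, const std::vector<Tin>& src) {
  // The real code launches a CUDA kernel; a trivial element-wise cast stands in here.
  dst.resize(src.size());
  for (size_t i = 0; i < src.size(); ++i) dst[i] = static_cast<Tout>(src[i]);
}

void convert(const std::string& kv_cache_dtype, const std::vector<float>& src,
             std::vector<uint8_t>& dst) {
  if (kv_cache_dtype == "auto") {
    convert_cache<uint8_t, float, KVCacheDataType::kAuto>(dst, src);
  } else if (kv_cache_dtype == "fp8" || kv_cache_dtype == "fp8_e4m3") {
    convert_cache<uint8_t, float, KVCacheDataType::kFp8E4M3>(dst, src);
  } else {
    throw std::invalid_argument("Unsupported data type: " + kv_cache_dtype);
  }
}

int main() {
  std::vector<float> src{1.f, 2.f, 3.f};
  std::vector<uint8_t> dst;
  convert("auto", src, dst);
  std::cout << "converted " << dst.size() << " elements\n";
}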
144  csrc/cpu/activation.cpp  Normal file
@@ -0,0 +1,144 @@
#include "cpu_types.hpp"

namespace {
template <typename scalar_t, vec_op::FP32Vec8 (*func)(const vec_op::FP32Vec8&),
          bool is_gated>
void activation_kernel(int num_tokens, int d, scalar_t* __restrict__ input,
                       scalar_t* __restrict__ output) {
  using scalar_vec_t = vec_op::vec_t<scalar_t>;
  constexpr int VEC_ELEM_NUM = scalar_vec_t::get_elem_num();

  TORCH_CHECK(d % VEC_ELEM_NUM == 0);

#pragma omp parallel for
  for (int i = 0; i < num_tokens; ++i) {
    for (int j = 0; j < d; j += VEC_ELEM_NUM) {
      int start = i * d;
      if constexpr (is_gated) {
        start *= 2;
      }

      const scalar_vec_t x(input + start + j);
      const vec_op::FP32Vec8 f32_x(x);
      vec_op::FP32Vec8 f32_ans = func(f32_x);

      if constexpr (is_gated) {
        const scalar_vec_t y(input + start + d + j);
        const vec_op::FP32Vec8 f32_y(y);
        f32_ans = f32_y * f32_ans;
      }

      const scalar_vec_t result(f32_ans);
      result.save(output + i * d + j);
    }
  }
}

FORCE_INLINE vec_op::FP32Vec8 silu_act(const vec_op::FP32Vec8& x) {
  const vec_op::FP32Vec8 zeros(0.0);
  const vec_op::FP32Vec8 ones(1.0);
  return x / (ones + (zeros - x).exp());
}

FORCE_INLINE vec_op::FP32Vec8 gelu_new_act(const vec_op::FP32Vec8& x) {
  const vec_op::FP32Vec8 ones(1.0);
  const vec_op::FP32Vec8 w1(0.79788456f);
  const vec_op::FP32Vec8 w2(0.044715f);
  const vec_op::FP32Vec8 w3(0.5);
  const vec_op::FP32Vec8 x3 = x * x * x;
  const vec_op::FP32Vec8 t = (w1 * (x + w2 * x3)).tanh();
  return w3 * x * (ones + t);
}

FORCE_INLINE vec_op::FP32Vec8 gelu_fast_act(const vec_op::FP32Vec8& x) {
  const vec_op::FP32Vec8 ones(1.0);
  const vec_op::FP32Vec8 w1(0.79788456f);
  const vec_op::FP32Vec8 w2(0.044715f);
  const vec_op::FP32Vec8 w3(0.5);
  const vec_op::FP32Vec8 t = (x * w1 * (ones + x * w2 * x)).tanh();
  return w3 * x * (ones + t);
}

FORCE_INLINE vec_op::FP32Vec8 gelu_act(const vec_op::FP32Vec8& x) {
  const vec_op::FP32Vec8 ones(1.0);
  const vec_op::FP32Vec8 w1(M_SQRT1_2);
  const vec_op::FP32Vec8 w2(0.5);
  return x * w2 * (ones + (x * w1).er());
}

FORCE_INLINE vec_op::FP32Vec8 gelu_tanh_act(const vec_op::FP32Vec8& x) {
  const vec_op::FP32Vec8 ones(1.0);
  const vec_op::FP32Vec8 w1(M_SQRT2 * M_2_SQRTPI * 0.5);
  const vec_op::FP32Vec8 w2(0.5);
  const vec_op::FP32Vec8 w3(0.044715);
  const vec_op::FP32Vec8 x_3 = x * x * x;
  const vec_op::FP32Vec8 inner = w1 * (x + x_3 * w3);
  return x * w2 * (ones + inner.tanh());
}
};  // namespace

void silu_and_mul(torch::Tensor& out, torch::Tensor& input) {
  int num_tokens = input.numel() / input.size(-1);
  int d = input.size(-1) / 2;

  VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "silu_and_mul_impl", [&] {
    CPU_KERNEL_GUARD_IN(silu_and_mul_impl)
    activation_kernel<scalar_t, silu_act, true>(
        num_tokens, d, input.data_ptr<scalar_t>(), out.data_ptr<scalar_t>());
    CPU_KERNEL_GUARD_OUT(silu_and_mul_impl)
  });
}

void gelu_and_mul(torch::Tensor& out,    // [..., d]
                  torch::Tensor& input)  // [..., 2 * d]
{
  int num_tokens = input.numel() / input.size(-1);
  int d = input.size(-1) / 2;

  VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "gelu_and_mul_impl", [&] {
    CPU_KERNEL_GUARD_IN(gelu_and_mul_impl)
    activation_kernel<scalar_t, gelu_act, true>(
        num_tokens, d, input.data_ptr<scalar_t>(), out.data_ptr<scalar_t>());
    CPU_KERNEL_GUARD_OUT(gelu_and_mul_impl)
  });
}

void gelu_tanh_and_mul(torch::Tensor& out,    // [..., d]
                       torch::Tensor& input)  // [..., 2 * d]
{
  int num_tokens = input.numel() / input.size(-1);
  int d = input.size(-1) / 2;

  VLLM_DISPATCH_FLOATING_TYPES(
      input.scalar_type(), "gelu_tanh_and_mul_impl", [&] {
        CPU_KERNEL_GUARD_IN(gelu_tanh_and_mul_impl)
        activation_kernel<scalar_t, gelu_tanh_act, true>(
            num_tokens, d, input.data_ptr<scalar_t>(),
            out.data_ptr<scalar_t>());
        CPU_KERNEL_GUARD_OUT(gelu_tanh_and_mul_impl)
      });
}

void gelu_new(torch::Tensor& out, torch::Tensor& input) {
  int num_tokens = input.numel() / input.size(-1);
  int d = input.size(-1);

  VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "gelu_new_impl", [&] {
    CPU_KERNEL_GUARD_IN(gelu_new_impl)
    activation_kernel<scalar_t, gelu_new_act, false>(
        num_tokens, d, input.data_ptr<scalar_t>(), out.data_ptr<scalar_t>());
    CPU_KERNEL_GUARD_OUT(gelu_new_impl)
  });
}

void gelu_fast(torch::Tensor& out, torch::Tensor& input) {
  int num_tokens = input.numel() / input.size(-1);
  int d = input.size(-1);

  VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "gelu_fast_impl", [&] {
    CPU_KERNEL_GUARD_IN(gelu_fast_impl)
    activation_kernel<scalar_t, gelu_fast_act, false>(
        num_tokens, d, input.data_ptr<scalar_t>(), out.data_ptr<scalar_t>());
    CPU_KERNEL_GUARD_OUT(gelu_fast_impl)
  });
}
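Because the vectorized path above depends on the AVX wrapper types, a plain scalar reference is handy when checking it. The standalone snippet below is an illustrative test harness, not part of the file (the name silu_and_mul_ref is made up); it computes the same gated SiLU-and-multiply on a flat float buffer laid out as [num_tokens, 2 * d].

// Illustrative scalar reference for the is_gated SiLU path above.
#include <cmath>
#include <cstdio>
#include <vector>

void silu_and_mul_ref(const std::vector<float>& input, int num_tokens, int d,
                      std::vector<float>& output) {
  output.resize(static_cast<size_t>(num_tokens) * d);
  for (int i = 0; i < num_tokens; ++i) {
    for (int j = 0; j < d; ++j) {
      const float x = input[static_cast<size_t>(i) * 2 * d + j];      // activation half
      const float y = input[static_cast<size_t>(i) * 2 * d + d + j];  // gate half
      const float silu = x / (1.0f + std::exp(-x));
      output[static_cast<size_t>(i) * d + j] = y * silu;
    }
  }
}

int main() {
  std::vector<float> in{1.0f, -2.0f, 0.5f, 3.0f};  // one token, d = 2
  std::vector<float> out;
  silu_and_mul_ref(in, /*num_tokens=*/1, /*d=*/2, out);
  std::printf("%f %f\n", out[0], out[1]);
}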
758  csrc/cpu/attention.cpp  Normal file
@@ -0,0 +1,758 @@
|
|||||||
|
#include "cpu_types.hpp"
|
||||||
|
|
||||||
|
namespace {
|
||||||
|
|
||||||
|
template <typename scalar_t>
|
||||||
|
struct KernelVecType {
|
||||||
|
using q_load_vec_type = void;
|
||||||
|
using q_vec_type = void;
|
||||||
|
using k_load_vec_type = void;
|
||||||
|
using k_vec_type = void;
|
||||||
|
using qk_acc_vec_type = void;
|
||||||
|
using v_load_vec_type = void;
|
||||||
|
};
|
||||||
|
|
||||||
|
template <>
|
||||||
|
struct KernelVecType<float> {
|
||||||
|
using q_load_vec_type = vec_op::FP32Vec4;
|
||||||
|
using q_vec_type = vec_op::FP32Vec16;
|
||||||
|
using k_load_vec_type = vec_op::FP32Vec16;
|
||||||
|
using k_vec_type = vec_op::FP32Vec16;
|
||||||
|
using qk_acc_vec_type = vec_op::FP32Vec16;
|
||||||
|
using v_load_vec_type = vec_op::FP32Vec16;
|
||||||
|
};
|
||||||
|
|
||||||
|
#ifdef __AVX512BF16__
|
||||||
|
template <>
|
||||||
|
struct KernelVecType<c10::BFloat16> {
|
||||||
|
using q_load_vec_type = vec_op::BF16Vec8;
|
||||||
|
using q_vec_type = vec_op::BF16Vec32;
|
||||||
|
using k_load_vec_type = vec_op::BF16Vec32;
|
||||||
|
using k_vec_type = vec_op::BF16Vec32;
|
||||||
|
using qk_acc_vec_type = vec_op::FP32Vec16;
|
||||||
|
using v_load_vec_type = vec_op::BF16Vec16;
|
||||||
|
};
|
||||||
|
#else
|
||||||
|
template <>
|
||||||
|
struct KernelVecType<c10::BFloat16> {
|
||||||
|
using q_load_vec_type = vec_op::BF16Vec8;
|
||||||
|
using q_vec_type = vec_op::FP32Vec16;
|
||||||
|
using k_load_vec_type = vec_op::BF16Vec16;
|
||||||
|
using k_vec_type = vec_op::FP32Vec16;
|
||||||
|
using qk_acc_vec_type = vec_op::FP32Vec16;
|
||||||
|
using v_load_vec_type = vec_op::BF16Vec16;
|
||||||
|
};
|
||||||
|
#endif
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
FORCE_INLINE std::pair<T, T> reduceSoftmax(T* data, const int size,
|
||||||
|
const int capacity) {
|
||||||
|
T max = data[0];
|
||||||
|
for (int i = 1; i < size; ++i) {
|
||||||
|
max = max >= data[i] ? max : data[i];
|
||||||
|
}
|
||||||
|
|
||||||
|
T sum = 0;
|
||||||
|
for (int i = 0; i < size; ++i) {
|
||||||
|
data[i] = std::exp(data[i] - max);
|
||||||
|
sum += data[i];
|
||||||
|
}
|
||||||
|
|
||||||
|
int i = 0;
|
||||||
|
for (; i < size; ++i) {
|
||||||
|
data[i] /= sum;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (; i < capacity; ++i) {
|
||||||
|
data[i] = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
return {max, sum};
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
FORCE_INLINE std::pair<T, T> reduceSoftmaxAlibi(T* data, const int size,
|
||||||
|
const int capacity,
|
||||||
|
const float alibi_slope,
|
||||||
|
const int start_index,
|
||||||
|
const int seq_len) {
|
||||||
|
data[0] += alibi_slope * (start_index - seq_len + 1);
|
||||||
|
T max = data[0];
|
||||||
|
for (int i = 1; i < size; ++i) {
|
||||||
|
T qk = data[i] + alibi_slope * (start_index + i - seq_len + 1);
|
||||||
|
data[i] = qk;
|
||||||
|
max = max >= qk ? max : qk;
|
||||||
|
}
|
||||||
|
|
||||||
|
T sum = 0;
|
||||||
|
for (int i = 0; i < size; ++i) {
|
||||||
|
data[i] = std::exp(data[i] - max);
|
||||||
|
sum += data[i];
|
||||||
|
}
|
||||||
|
|
||||||
|
int i = 0;
|
||||||
|
for (; i < size; ++i) {
|
||||||
|
data[i] /= sum;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (; i < capacity; ++i) {
|
||||||
|
data[i] = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
return {max, sum};
|
||||||
|
}
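// --- Illustrative note (added, not in the original file) -----------------------
// Both softmax helpers above zero-fill data[size, capacity) after normalizing. The
// value reduction below always consumes probabilities in whole BLOCK_SIZE-wide
// vector loads, so the padded tail of a partially filled last block must contribute
// exactly zero. A minimal scalar check of that contract (hypothetical helper):
inline bool softmaxPaddingIsZero(const float* data, const int size,
                                 const int capacity) {
  for (int i = size; i < capacity; ++i) {
    if (data[i] != 0.0f) return false;  // padded lanes must not affect the weighted sum
  }
  return true;
}
// -------------------------------------------------------------------------------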
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
FORCE_INLINE void reducePartitonSoftmax(const T* max_data, T* sum_data,
|
||||||
|
const int size) {
|
||||||
|
T max = max_data[0];
|
||||||
|
for (int i = 1; i < size; ++i) {
|
||||||
|
max = max >= max_data[i] ? max : max_data[i];
|
||||||
|
}
|
||||||
|
|
||||||
|
T rescaled_sum = 0;
|
||||||
|
for (int i = 0; i < size; ++i) {
|
||||||
|
T rescale_factor = std::exp(max_data[i] - max);
|
||||||
|
rescaled_sum += rescale_factor * sum_data[i];
|
||||||
|
sum_data[i] *= rescale_factor;
|
||||||
|
}
|
||||||
|
for (int i = 0; i < size; ++i) {
|
||||||
|
sum_data[i] /= rescaled_sum + 1e-8;
|
||||||
|
}
|
||||||
|
}
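// --- Illustrative note (added, not in the original file) -----------------------
// reducePartitonSoftmax turns per-partition (max, exp-sum) pairs into per-partition
// mixing weights: each partial exp-sum is rescaled from its local max to the global
// max, and the weights are normalized to sum to one (up to the 1e-8 guard). Scalar
// restatement of the same computation, kept only as documentation:
inline void partitionWeightsReference(const float* max_data, float* weights,
                                      const int partition_num) {
  float global_max = max_data[0];
  for (int i = 1; i < partition_num; ++i)
    global_max = global_max >= max_data[i] ? global_max : max_data[i];
  float total = 0.0f;
  for (int i = 0; i < partition_num; ++i) {
    weights[i] *= std::exp(max_data[i] - global_max);  // rescale local exp-sums
    total += weights[i];
  }
  for (int i = 0; i < partition_num; ++i) weights[i] /= total + 1e-8f;
}
// -------------------------------------------------------------------------------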
|
||||||
|
|
||||||
|
template <typename scalar_t, int HEAD_SIZE, int BLOCK_SIZE, int x>
|
||||||
|
struct reduceQKBlockKernel {
|
||||||
|
using q_load_vec_type = typename KernelVecType<scalar_t>::q_load_vec_type;
|
||||||
|
using q_vec_type = typename KernelVecType<scalar_t>::q_vec_type;
|
||||||
|
using k_load_vec_type = typename KernelVecType<scalar_t>::k_load_vec_type;
|
||||||
|
using k_vec_type = typename KernelVecType<scalar_t>::k_vec_type;
|
||||||
|
using qk_acc_vec_type = typename KernelVecType<scalar_t>::qk_acc_vec_type;
|
||||||
|
|
||||||
|
constexpr static int TOKEN_PER_GROUP = k_load_vec_type::get_elem_num() / x;
|
||||||
|
constexpr static int MAX_GROUP_NUM = 16 / TOKEN_PER_GROUP;
|
||||||
|
constexpr static int UNROLL_GROUP_NUM = MAX_GROUP_NUM / 4;
|
||||||
|
|
||||||
|
static_assert(MAX_GROUP_NUM == 8 || MAX_GROUP_NUM == 4);
|
||||||
|
static_assert(k_load_vec_type::get_elem_num() % x == 0);
|
||||||
|
static_assert(q_load_vec_type::get_elem_num() * sizeof(scalar_t) == 16);
|
||||||
|
|
||||||
|
FORCE_INLINE static void call(const scalar_t* __restrict__ q,
|
||||||
|
const scalar_t* __restrict__ k_block,
|
||||||
|
float* __restrict__ logits, float scale,
|
||||||
|
const int token_num) {
|
||||||
|
const int group_num = (token_num + TOKEN_PER_GROUP - 1) / TOKEN_PER_GROUP;
|
||||||
|
|
||||||
|
qk_acc_vec_type group_accums[MAX_GROUP_NUM];
|
||||||
|
if (token_num == BLOCK_SIZE) {
|
||||||
|
for (int q_offset = 0; q_offset < HEAD_SIZE;
|
||||||
|
q_offset += x, k_block += x * BLOCK_SIZE) {
|
||||||
|
q_load_vec_type q_load_group_vec(q + q_offset);
|
||||||
|
q_vec_type q_group_vec(q_load_group_vec);
|
||||||
|
|
||||||
|
vec_op::unroll_loop<int, MAX_GROUP_NUM>(
|
||||||
|
[k_block, &q_group_vec, &group_accums](int token_group_idx) {
|
||||||
|
k_load_vec_type k_load_group_vec(k_block + token_group_idx * x *
|
||||||
|
TOKEN_PER_GROUP);
|
||||||
|
k_vec_type k_group_vec(k_load_group_vec);
|
||||||
|
vec_op::fma(group_accums[token_group_idx], q_group_vec,
|
||||||
|
k_group_vec);
|
||||||
|
vec_op::prefetch(k_block + x * BLOCK_SIZE +
|
||||||
|
token_group_idx * x * TOKEN_PER_GROUP);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
for (int q_offset = 0; q_offset < HEAD_SIZE;
|
||||||
|
q_offset += x, k_block += x * BLOCK_SIZE) {
|
||||||
|
q_load_vec_type q_load_group_vec(q + q_offset);
|
||||||
|
q_vec_type q_group_vec(q_load_group_vec);
|
||||||
|
for (int token_group_start = 0; token_group_start < group_num;
|
||||||
|
token_group_start += UNROLL_GROUP_NUM) {
|
||||||
|
vec_op::unroll_loop<int, UNROLL_GROUP_NUM>(
|
||||||
|
[token_group_start, k_block, &q_group_vec,
|
||||||
|
&group_accums](int token_group_idx) {
|
||||||
|
token_group_idx += token_group_start;
|
||||||
|
k_load_vec_type k_load_group_vec(k_block + token_group_idx * x *
|
||||||
|
TOKEN_PER_GROUP);
|
||||||
|
k_vec_type k_group_vec(k_load_group_vec);
|
||||||
|
vec_op::fma(group_accums[token_group_idx], q_group_vec,
|
||||||
|
k_group_vec);
|
||||||
|
vec_op::prefetch(k_block + x * BLOCK_SIZE +
|
||||||
|
token_group_idx * x * TOKEN_PER_GROUP);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int token_group_idx = 0; token_group_idx < group_num;
|
||||||
|
++token_group_idx) {
|
||||||
|
vec_op::unroll_loop<int, TOKEN_PER_GROUP>(
|
||||||
|
[&group_accums, logits, scale, token_group_idx](int token_idx) {
|
||||||
|
float dot_v =
|
||||||
|
group_accums[token_group_idx]
|
||||||
|
.template reduce_sub_sum<qk_acc_vec_type::get_elem_num() /
|
||||||
|
TOKEN_PER_GROUP>(token_idx);
|
||||||
|
logits[token_group_idx * TOKEN_PER_GROUP + token_idx] =
|
||||||
|
dot_v * scale;
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
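// --- Illustrative note (added, not in the original file) -----------------------
// Functionally, reduceQKBlockKernel computes logits[t] = scale * dot(q, k_t) for
// each of the (up to BLOCK_SIZE) tokens in one KV block, with the key block laid
// out as [HEAD_SIZE / x, BLOCK_SIZE, x] so that x consecutive key elements of one
// token are contiguous. Scalar restatement (hypothetical helper, documentation only):
template <int HEAD_SIZE, int BLOCK_SIZE, int x>
void qkBlockReference(const float* q, const float* k_block, float* logits,
                      float scale, const int token_num) {
  for (int t = 0; t < token_num; ++t) {
    float acc = 0.0f;
    for (int off = 0; off < HEAD_SIZE; off += x) {
      for (int i = 0; i < x; ++i) {
        // key element for token t at head offset (off + i)
        acc += q[off + i] * k_block[(off / x) * BLOCK_SIZE * x + t * x + i];
      }
    }
    logits[t] = acc * scale;
  }
}
// -------------------------------------------------------------------------------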
|
||||||
|
|
||||||
|
template <typename scalar_t, int HEAD_SIZE, int BLOCK_SIZE,
|
||||||
|
int HEAD_PARTITION_SIZE, typename acc_t>
|
||||||
|
FORCE_INLINE void reduceValueBlock(const float* prob, const scalar_t* v_block,
|
||||||
|
acc_t&& acc) {
|
||||||
|
using v_load_vec_type = typename KernelVecType<scalar_t>::v_load_vec_type;
|
||||||
|
constexpr int ELEM_NUM = v_load_vec_type::get_elem_num();
|
||||||
|
static_assert(BLOCK_SIZE == ELEM_NUM);
|
||||||
|
vec_op::FP32Vec16 prob_vec(prob);
|
||||||
|
|
||||||
|
vec_op::unroll_loop<int, HEAD_PARTITION_SIZE>([&](int head_elem_idx) {
|
||||||
|
v_load_vec_type v_vec(v_block + BLOCK_SIZE * head_elem_idx);
|
||||||
|
vec_op::FP32Vec16 fp32_v_vec(v_vec);
|
||||||
|
acc[head_elem_idx] = acc[head_elem_idx] + prob_vec * fp32_v_vec;
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}; // namespace
|
||||||
|
|
||||||
|
// Paged attention v1
|
||||||
|
namespace {
|
||||||
|
template <typename scalar_t, int HEAD_SIZE, int BLOCK_SIZE>
|
||||||
|
struct paged_attention_v1_impl {
|
||||||
|
static void call(
|
||||||
|
scalar_t* __restrict__ out, // [num_seqs, num_heads, head_size]
|
||||||
|
const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size]
|
||||||
|
const scalar_t* __restrict__ k_cache, // [num_blocks, num_kv_heads,
|
||||||
|
// head_size/x, block_size, x]
|
||||||
|
const scalar_t* __restrict__ v_cache, // [num_blocks, num_kv_heads,
|
||||||
|
// head_size, block_size]
|
||||||
|
const int num_kv_heads, const float scale,
|
||||||
|
const int* __restrict__ block_tables, // [num_seqs,
|
||||||
|
// max_num_blocks_per_seq]
|
||||||
|
const int* __restrict__ seq_lens, // [num_seqs]
|
||||||
|
const int max_num_blocks_per_seq,
|
||||||
|
const float* __restrict__ alibi_slopes, // [num_heads]
|
||||||
|
const int q_stride, const int kv_block_stride, const int kv_head_stride,
|
||||||
|
const int num_seqs, const int num_heads) {
|
||||||
|
constexpr int x = 16 / sizeof(scalar_t);
|
||||||
|
const int num_queries_per_kv = num_heads / num_kv_heads;
|
||||||
|
|
||||||
|
static_assert(BLOCK_SIZE == 16);
|
||||||
|
|
||||||
|
int max_seq_len = max_num_blocks_per_seq * BLOCK_SIZE;
|
||||||
|
int max_seq_len_padded = (max_seq_len + 15) & 0xFFFFFFF0;
|
||||||
|
TORCH_CHECK((max_seq_len_padded * sizeof(float)) % 64 == 0);
|
||||||
|
|
||||||
|
const int parallel_work_item_num = omp_get_max_threads();
|
||||||
|
|
||||||
|
size_t logits_bytes =
|
||||||
|
parallel_work_item_num * max_seq_len_padded * sizeof(float);
|
||||||
|
float* logits = (float*)std::aligned_alloc(
|
||||||
|
64, logits_bytes); // Cacheline alignment for each context token.
|
||||||
|
// [parallel_work_item_num, max_seq_len_padded]
|
||||||
|
|
||||||
|
#pragma omp parallel for collapse(2) schedule(dynamic, 1)
|
||||||
|
for (int seq_idx = 0; seq_idx < num_seqs; ++seq_idx) {
|
||||||
|
for (int head_idx = 0; head_idx < num_heads; ++head_idx) {
|
||||||
|
int seq_len = seq_lens[seq_idx];
|
||||||
|
const int* seq_block_table =
|
||||||
|
block_tables + max_num_blocks_per_seq * seq_idx;
|
||||||
|
const int block_num = (seq_len + BLOCK_SIZE - 1) / BLOCK_SIZE;
|
||||||
|
const int64_t kv_head_idx = head_idx / num_queries_per_kv;
|
||||||
|
const scalar_t* __restrict__ q_vec_ptr =
|
||||||
|
q + seq_idx * q_stride + head_idx * HEAD_SIZE;
|
||||||
|
const int last_block_token_num = seq_len - (block_num - 1) * BLOCK_SIZE;
|
||||||
|
float* __restrict__ thread_block_logits =
|
||||||
|
logits + omp_get_thread_num() * max_seq_len_padded;
|
||||||
|
|
||||||
|
// Compute logits
|
||||||
|
for (int block_idx = 0; block_idx < block_num; ++block_idx) {
|
||||||
|
const int64_t physical_block_idx = seq_block_table[block_idx];
|
||||||
|
const scalar_t* __restrict__ k_block_cache_ptr =
|
||||||
|
k_cache + physical_block_idx * kv_block_stride +
|
||||||
|
kv_head_idx * kv_head_stride;
|
||||||
|
float* __restrict__ head_block_logits =
|
||||||
|
thread_block_logits + block_idx * BLOCK_SIZE;
|
||||||
|
|
||||||
|
reduceQKBlockKernel<scalar_t, HEAD_SIZE, BLOCK_SIZE, x>::call(
|
||||||
|
q_vec_ptr, k_block_cache_ptr, head_block_logits, scale,
|
||||||
|
block_idx == block_num - 1 ? last_block_token_num : BLOCK_SIZE);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Compute softmax
|
||||||
|
if (alibi_slopes) {
|
||||||
|
reduceSoftmaxAlibi(thread_block_logits, seq_len,
|
||||||
|
block_num * BLOCK_SIZE, alibi_slopes[head_idx], 0,
|
||||||
|
seq_len);
|
||||||
|
} else {
|
||||||
|
reduceSoftmax(thread_block_logits, seq_len, block_num * BLOCK_SIZE);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Compute value
|
||||||
|
constexpr int head_elem_num_per_partition = 16;
|
||||||
|
constexpr int head_partition_num =
|
||||||
|
HEAD_SIZE / head_elem_num_per_partition;
|
||||||
|
for (int head_part_idx = 0; head_part_idx < head_partition_num;
|
||||||
|
++head_part_idx) {
|
||||||
|
vec_op::FP32Vec16 accums[head_elem_num_per_partition];
|
||||||
|
scalar_t* __restrict__ out_ptr =
|
||||||
|
out + seq_idx * num_heads * HEAD_SIZE + head_idx * HEAD_SIZE +
|
||||||
|
head_part_idx * head_elem_num_per_partition;
|
||||||
|
for (int block_idx = 0; block_idx < block_num; ++block_idx) {
|
||||||
|
const int64_t physical_block_idx = seq_block_table[block_idx];
|
||||||
|
const float* __restrict__ prob_vec_ptr =
|
||||||
|
thread_block_logits + block_idx * BLOCK_SIZE;
|
||||||
|
const scalar_t* __restrict__ v_block_cache_ptr =
|
||||||
|
v_cache + physical_block_idx * kv_block_stride +
|
||||||
|
kv_head_idx * kv_head_stride +
|
||||||
|
BLOCK_SIZE * head_part_idx * head_elem_num_per_partition;
|
||||||
|
reduceValueBlock<scalar_t, HEAD_SIZE, BLOCK_SIZE,
|
||||||
|
head_elem_num_per_partition>(
|
||||||
|
prob_vec_ptr, v_block_cache_ptr, accums);
|
||||||
|
|
||||||
|
if (block_idx != block_num - 1) {
|
||||||
|
const int64_t next_physical_block_idx =
|
||||||
|
seq_block_table[block_idx + 1];
|
||||||
|
const scalar_t* __restrict__ next_v_block_cache_ptr =
|
||||||
|
v_cache + next_physical_block_idx * kv_block_stride +
|
||||||
|
kv_head_idx * kv_head_stride +
|
||||||
|
BLOCK_SIZE * head_part_idx * head_elem_num_per_partition;
|
||||||
|
vec_op::unroll_loop<int, head_elem_num_per_partition>(
|
||||||
|
[&](int head_elem_idx) {
|
||||||
|
if (head_elem_idx % 2 == 0) {
|
||||||
|
vec_op::prefetch(next_v_block_cache_ptr +
|
||||||
|
BLOCK_SIZE * head_elem_idx);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
vec_op::unroll_loop<int, head_elem_num_per_partition>(
|
||||||
|
[&](int head_elem_idx) {
|
||||||
|
float value = accums[head_elem_idx].reduce_sum();
|
||||||
|
vec_op::storeFP32(value, out_ptr + head_elem_idx);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
std::free(logits);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
#define LAUNCH_V1_ATTENTION_KERNEL(T, HEAD_SIZE, BLOCK_SIZE) \
|
||||||
|
paged_attention_v1_impl<T, HEAD_SIZE, BLOCK_SIZE>::call( \
|
||||||
|
out_ptr, query_ptr, key_cache_ptr, value_cache_ptr, num_kv_heads, scale, \
|
||||||
|
block_tables_ptr, seq_lens_ptr, max_num_blocks_per_seq, \
|
||||||
|
alibi_slopes_ptr, q_stride, kv_block_stride, kv_head_stride, num_seqs, \
|
||||||
|
num_heads);
|
||||||
|
|
||||||
|
template <typename T, int BLOCK_SIZE>
|
||||||
|
void paged_attention_v1_impl_launcher(
|
||||||
|
torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache,
|
||||||
|
torch::Tensor& value_cache, int num_kv_heads, float scale,
|
||||||
|
torch::Tensor& block_tables, torch::Tensor& seq_lens, int max_seq_len,
|
||||||
|
const c10::optional<torch::Tensor>& alibi_slopes) {
|
||||||
|
int num_seqs = query.size(0);
|
||||||
|
int num_heads = query.size(1);
|
||||||
|
int head_size = query.size(2);
|
||||||
|
int max_num_blocks_per_seq = block_tables.size(1);
|
||||||
|
int q_stride = query.stride(0);
|
||||||
|
int kv_block_stride = key_cache.stride(0);
|
||||||
|
int kv_head_stride = key_cache.stride(1);
|
||||||
|
|
||||||
|
// NOTE: alibi_slopes is optional.
|
||||||
|
const float* alibi_slopes_ptr =
|
||||||
|
alibi_slopes
|
||||||
|
? reinterpret_cast<const float*>(alibi_slopes.value().data_ptr())
|
||||||
|
: nullptr;
|
||||||
|
|
||||||
|
T* out_ptr = reinterpret_cast<T*>(out.data_ptr());
|
||||||
|
T* query_ptr = reinterpret_cast<T*>(query.data_ptr());
|
||||||
|
T* key_cache_ptr = reinterpret_cast<T*>(key_cache.data_ptr());
|
||||||
|
T* value_cache_ptr = reinterpret_cast<T*>(value_cache.data_ptr());
|
||||||
|
int* block_tables_ptr = block_tables.data_ptr<int>();
|
||||||
|
int* seq_lens_ptr = seq_lens.data_ptr<int>();
|
||||||
|
|
||||||
|
switch (head_size) {
|
||||||
|
case 64:
|
||||||
|
LAUNCH_V1_ATTENTION_KERNEL(T, 64, BLOCK_SIZE);
|
||||||
|
break;
|
||||||
|
case 80:
|
||||||
|
LAUNCH_V1_ATTENTION_KERNEL(T, 80, BLOCK_SIZE);
|
||||||
|
break;
|
||||||
|
case 96:
|
||||||
|
LAUNCH_V1_ATTENTION_KERNEL(T, 96, BLOCK_SIZE);
|
||||||
|
break;
|
||||||
|
case 112:
|
||||||
|
LAUNCH_V1_ATTENTION_KERNEL(T, 112, BLOCK_SIZE);
|
||||||
|
break;
|
||||||
|
case 128:
|
||||||
|
LAUNCH_V1_ATTENTION_KERNEL(T, 128, BLOCK_SIZE);
|
||||||
|
break;
|
||||||
|
case 192:
|
||||||
|
LAUNCH_V1_ATTENTION_KERNEL(T, 192, BLOCK_SIZE);
|
||||||
|
break;
|
||||||
|
case 256:
|
||||||
|
LAUNCH_V1_ATTENTION_KERNEL(T, 256, BLOCK_SIZE);
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
TORCH_CHECK(false, "Unsupported head size: ", head_size);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#define CALL_V1_KERNEL_LAUNCHER(T, BLOCK_SIZE) \
|
||||||
|
paged_attention_v1_impl_launcher<T, BLOCK_SIZE>( \
|
||||||
|
out, query, key_cache, value_cache, num_kv_heads, scale, block_tables, \
|
||||||
|
seq_lens, max_seq_len, alibi_slopes);
|
||||||
|
|
||||||
|
#define CALL_V1_KERNEL_LAUNCHER_BLOCK_SIZE(T) \
|
||||||
|
switch (block_size) { \
|
||||||
|
case 16: \
|
||||||
|
CALL_V1_KERNEL_LAUNCHER(T, 16); \
|
||||||
|
break; \
|
||||||
|
default: \
|
||||||
|
TORCH_CHECK(false, "Unsupported block size: ", block_size); \
|
||||||
|
break; \
|
||||||
|
}
|
||||||
|
} // namespace
|
||||||
|
|
||||||
|
void paged_attention_v1(
|
||||||
|
torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache,
|
||||||
|
torch::Tensor& value_cache, int64_t num_kv_heads, double scale,
|
||||||
|
torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size,
|
||||||
|
int64_t max_seq_len, const c10::optional<torch::Tensor>& alibi_slopes,
|
||||||
|
const std::string& kv_cache_dtype, double kv_scale, const int64_t tp_rank,
|
||||||
|
const int64_t blocksparse_local_blocks,
|
||||||
|
const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
|
||||||
|
const int64_t blocksparse_head_sliding_step) {
|
||||||
|
TORCH_CHECK(kv_scale == 1.0f);
|
||||||
|
TORCH_CHECK(blocksparse_vert_stride <= 1,
|
||||||
|
"CPU backend does not support blocksparse attention yet.");
|
||||||
|
VLLM_DISPATCH_FLOATING_TYPES(query.scalar_type(), "paged_attention_v1_impl",
|
||||||
|
[&] {
|
||||||
|
CPU_KERNEL_GUARD_IN(paged_attention_v1_impl)
|
||||||
|
CALL_V1_KERNEL_LAUNCHER_BLOCK_SIZE(scalar_t);
|
||||||
|
CPU_KERNEL_GUARD_OUT(paged_attention_v1_impl)
|
||||||
|
});
|
||||||
|
}
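// --- Illustrative note (added, not in the original file) -----------------------
// Per (sequence, head), the v1 implementation above is the usual attention recipe
// over the paged KV cache: logits = scale * q.K, p = softmax(logits + alibi),
// out = p.V, with K/V rows fetched block by block through block_tables. A compact
// scalar restatement over already-gathered K/V (hypothetical helper; assumes
// std::vector is available through the existing includes):
inline void pagedAttentionV1Reference(const float* q,
                                      const float* k,  // [seq_len, head_size]
                                      const float* v,  // [seq_len, head_size]
                                      float* out, float scale, int seq_len,
                                      int head_size) {
  std::vector<float> logits(seq_len);
  for (int t = 0; t < seq_len; ++t) {
    float acc = 0.0f;
    for (int i = 0; i < head_size; ++i) acc += q[i] * k[t * head_size + i];
    logits[t] = acc * scale;
  }
  reduceSoftmax(logits.data(), seq_len, seq_len);  // helper defined earlier in this file
  for (int i = 0; i < head_size; ++i) {
    float acc = 0.0f;
    for (int t = 0; t < seq_len; ++t) acc += logits[t] * v[t * head_size + i];
    out[i] = acc;
  }
}
// -------------------------------------------------------------------------------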
|
||||||
|
|
||||||
|
// Paged attention v2
|
||||||
|
namespace {
|
||||||
|
template <typename scalar_t, int HEAD_SIZE, int BLOCK_SIZE, int PARTITION_SIZE>
|
||||||
|
struct paged_attention_v2_impl {
|
||||||
|
static void call(
|
||||||
|
scalar_t* __restrict__ out, // [num_seqs, num_heads, head_size]
|
||||||
|
float* __restrict__ exp_sums, // [num_seqs, num_heads,
|
||||||
|
// max_num_partitions]
|
||||||
|
float* __restrict__ max_logits, // [num_seqs, num_heads,
|
||||||
|
// max_num_partitions]
|
||||||
|
scalar_t* __restrict__ tmp_out, // [num_seqs, num_heads,
|
||||||
|
// max_num_partitions, head_size]
|
||||||
|
const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size]
|
||||||
|
const scalar_t* __restrict__ k_cache, // [num_blocks, num_kv_heads,
|
||||||
|
// head_size/x, block_size, x]
|
||||||
|
const scalar_t* __restrict__ v_cache, // [num_blocks, num_kv_heads,
|
||||||
|
// head_size, block_size]
|
||||||
|
const int num_kv_heads, const float scale,
|
||||||
|
const int* __restrict__ block_tables, // [num_seqs,
|
||||||
|
// max_num_blocks_per_seq]
|
||||||
|
const int* __restrict__ seq_lens, // [num_seqs]
|
||||||
|
const int max_num_blocks_per_seq,
|
||||||
|
const float* __restrict__ alibi_slopes, // [num_heads]
|
||||||
|
const int q_stride, const int kv_block_stride, const int kv_head_stride,
|
||||||
|
const int num_seqs, const int num_heads, const int max_num_partitions) {
|
||||||
|
constexpr int x = 16 / sizeof(scalar_t);
|
||||||
|
const int num_queries_per_kv = num_heads / num_kv_heads;
|
||||||
|
|
||||||
|
static_assert(BLOCK_SIZE == 16);
|
||||||
|
static_assert(PARTITION_SIZE * sizeof(float) % 64 == 0);
|
||||||
|
static_assert(PARTITION_SIZE % BLOCK_SIZE == 0);
|
||||||
|
|
||||||
|
#pragma omp parallel for collapse(3) schedule(static, 1)
|
||||||
|
for (int seq_idx = 0; seq_idx < num_seqs; ++seq_idx) {
|
||||||
|
for (int partition_idx = 0; partition_idx < max_num_partitions;
|
||||||
|
++partition_idx) {
|
||||||
|
for (int head_idx = 0; head_idx < num_heads; ++head_idx) {
|
||||||
|
const int seq_len = seq_lens[seq_idx];
|
||||||
|
const int start_token_idx = partition_idx * PARTITION_SIZE;
|
||||||
|
|
||||||
|
if (start_token_idx >= seq_len) continue;
|
||||||
|
|
||||||
|
const int partition_num =
|
||||||
|
(seq_len + PARTITION_SIZE - 1) / PARTITION_SIZE;
|
||||||
|
const bool no_reduce = (partition_num == 1);
|
||||||
|
const int token_num =
|
||||||
|
(std::min(seq_len, start_token_idx + PARTITION_SIZE) -
|
||||||
|
start_token_idx);
|
||||||
|
const int block_num = (token_num + BLOCK_SIZE - 1) / BLOCK_SIZE;
|
||||||
|
const int last_block_token_num =
|
||||||
|
token_num - (block_num - 1) * BLOCK_SIZE;
|
||||||
|
const int* seq_block_table = block_tables +
|
||||||
|
max_num_blocks_per_seq * seq_idx +
|
||||||
|
start_token_idx / BLOCK_SIZE;
|
||||||
|
const int64_t kv_head_idx = head_idx / num_queries_per_kv;
|
||||||
|
const scalar_t* __restrict__ q_vec_ptr =
|
||||||
|
q + seq_idx * q_stride + head_idx * HEAD_SIZE;
|
||||||
|
|
||||||
|
float logits[PARTITION_SIZE] __attribute__((aligned(64))) = {0};
|
||||||
|
|
||||||
|
// Compute logits
|
||||||
|
for (int block_idx = 0; block_idx < block_num; ++block_idx) {
|
||||||
|
const int64_t physical_block_idx = seq_block_table[block_idx];
|
||||||
|
const scalar_t* __restrict__ k_block_cache_ptr =
|
||||||
|
k_cache + physical_block_idx * kv_block_stride +
|
||||||
|
kv_head_idx * kv_head_stride;
|
||||||
|
float* __restrict__ head_block_logits =
|
||||||
|
logits + block_idx * BLOCK_SIZE;
|
||||||
|
|
||||||
|
reduceQKBlockKernel<scalar_t, HEAD_SIZE, BLOCK_SIZE, x>::call(
|
||||||
|
q_vec_ptr, k_block_cache_ptr, head_block_logits, scale,
|
||||||
|
block_idx == block_num - 1 ? last_block_token_num : BLOCK_SIZE);
|
||||||
|
}
|
||||||
|
|
||||||
|
std::pair<float, float> max_and_sum;
|
||||||
|
if (alibi_slopes) {
|
||||||
|
max_and_sum = reduceSoftmaxAlibi(
|
||||||
|
logits, token_num, block_num * BLOCK_SIZE,
|
||||||
|
alibi_slopes[head_idx], start_token_idx, seq_len);
|
||||||
|
} else {
|
||||||
|
max_and_sum =
|
||||||
|
reduceSoftmax(logits, token_num, block_num * BLOCK_SIZE);
|
||||||
|
}
|
||||||
|
|
||||||
|
auto&& [max_logit, exp_sum] = max_and_sum;
|
||||||
|
|
||||||
|
scalar_t* __restrict__ output_buffer = nullptr;
|
||||||
|
if (!no_reduce) {
|
||||||
|
auto idx = seq_idx * num_heads * max_num_partitions +
|
||||||
|
head_idx * max_num_partitions + partition_idx;
|
||||||
|
max_logits[idx] = max_logit;
|
||||||
|
exp_sums[idx] = exp_sum;
|
||||||
|
output_buffer =
|
||||||
|
tmp_out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE +
|
||||||
|
head_idx * max_num_partitions * HEAD_SIZE +
|
||||||
|
partition_idx * HEAD_SIZE;
|
||||||
|
} else {
|
||||||
|
output_buffer =
|
||||||
|
out + seq_idx * num_heads * HEAD_SIZE + head_idx * HEAD_SIZE;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Compute value
|
||||||
|
constexpr int head_elem_num_per_partition = 16;
|
||||||
|
constexpr int head_partition_num =
|
||||||
|
HEAD_SIZE / head_elem_num_per_partition;
|
||||||
|
for (int head_part_idx = 0; head_part_idx < head_partition_num;
|
||||||
|
++head_part_idx) {
|
||||||
|
vec_op::FP32Vec16 accums[head_elem_num_per_partition];
|
||||||
|
scalar_t* __restrict__ out_ptr =
|
||||||
|
output_buffer + head_part_idx * head_elem_num_per_partition;
|
||||||
|
for (int block_idx = 0; block_idx < block_num; ++block_idx) {
|
||||||
|
const int64_t physical_block_idx = seq_block_table[block_idx];
|
||||||
|
const float* __restrict__ prob_vec_ptr =
|
||||||
|
logits + block_idx * BLOCK_SIZE;
|
||||||
|
const scalar_t* __restrict__ v_block_cache_ptr =
|
||||||
|
v_cache + physical_block_idx * kv_block_stride +
|
||||||
|
kv_head_idx * kv_head_stride +
|
||||||
|
BLOCK_SIZE * head_part_idx * head_elem_num_per_partition;
|
||||||
|
reduceValueBlock<scalar_t, HEAD_SIZE, BLOCK_SIZE,
|
||||||
|
head_elem_num_per_partition>(
|
||||||
|
prob_vec_ptr, v_block_cache_ptr, accums);
|
||||||
|
|
||||||
|
if (block_idx != block_num - 1) {
|
||||||
|
const int64_t next_physical_block_idx =
|
||||||
|
seq_block_table[block_idx + 1];
|
||||||
|
const scalar_t* __restrict__ next_v_block_cache_ptr =
|
||||||
|
v_cache + next_physical_block_idx * kv_block_stride +
|
||||||
|
kv_head_idx * kv_head_stride +
|
||||||
|
BLOCK_SIZE * head_part_idx * head_elem_num_per_partition;
|
||||||
|
vec_op::unroll_loop<int, head_elem_num_per_partition>(
|
||||||
|
[&](int head_elem_idx) {
|
||||||
|
if (head_elem_idx % 2 == 0) {
|
||||||
|
vec_op::prefetch(next_v_block_cache_ptr +
|
||||||
|
BLOCK_SIZE * head_elem_idx);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
vec_op::unroll_loop<int, head_elem_num_per_partition>(
|
||||||
|
[&](int head_elem_idx) {
|
||||||
|
float value = accums[head_elem_idx].reduce_sum();
|
||||||
|
vec_op::storeFP32(value, out_ptr + head_elem_idx);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Rescale partition softmax and store the factors to exp_sums
|
||||||
|
#pragma omp parallel for collapse(2) schedule(static, 1)
|
||||||
|
for (int seq_idx = 0; seq_idx < num_seqs; ++seq_idx) {
|
||||||
|
for (int head_idx = 0; head_idx < num_heads; ++head_idx) {
|
||||||
|
const int seq_len = seq_lens[seq_idx];
|
||||||
|
const int partition_num =
|
||||||
|
(seq_len + PARTITION_SIZE - 1) / PARTITION_SIZE;
|
||||||
|
|
||||||
|
if (partition_num == 1) continue;
|
||||||
|
|
||||||
|
reducePartitonSoftmax(
|
||||||
|
max_logits + seq_idx * num_heads * max_num_partitions +
|
||||||
|
head_idx * max_num_partitions,
|
||||||
|
exp_sums + seq_idx * num_heads * max_num_partitions +
|
||||||
|
head_idx * max_num_partitions,
|
||||||
|
partition_num);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Reduce values
|
||||||
|
using v_load_vec_type = typename KernelVecType<scalar_t>::v_load_vec_type;
|
||||||
|
static_assert(v_load_vec_type::get_elem_num() == BLOCK_SIZE);
|
||||||
|
      constexpr int head_elem_num_per_group =
          16;  // Note: not aligned to the cache line size, because some HEAD_SIZE
               // values are not multiples of 64 bytes.
|
||||||
|
static_assert(HEAD_SIZE % head_elem_num_per_group == 0);
|
||||||
|
constexpr int head_group_num = HEAD_SIZE / head_elem_num_per_group;
|
||||||
|
const float* __restrict__ rescale_factors = exp_sums;
|
||||||
|
#pragma omp parallel for collapse(3) schedule(static, 1)
|
||||||
|
for (int seq_idx = 0; seq_idx < num_seqs; ++seq_idx) {
|
||||||
|
for (int head_idx = 0; head_idx < num_heads; ++head_idx) {
|
||||||
|
for (int group_idx = 0; group_idx < head_group_num; ++group_idx) {
|
||||||
|
const int seq_len = seq_lens[seq_idx];
|
||||||
|
const int partition_num =
|
||||||
|
(seq_len + PARTITION_SIZE - 1) / PARTITION_SIZE;
|
||||||
|
|
||||||
|
if (partition_num == 1) continue;
|
||||||
|
|
||||||
|
const float* __restrict__ seq_head_rescale_factors =
|
||||||
|
rescale_factors + seq_idx * num_heads * max_num_partitions +
|
||||||
|
head_idx * max_num_partitions;
|
||||||
|
const scalar_t* __restrict__ seq_head_tmp_out =
|
||||||
|
tmp_out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE +
|
||||||
|
head_idx * max_num_partitions * HEAD_SIZE +
|
||||||
|
group_idx * head_elem_num_per_group;
|
||||||
|
scalar_t* __restrict__ seq_head_output =
|
||||||
|
out + seq_idx * num_heads * HEAD_SIZE + head_idx * HEAD_SIZE +
|
||||||
|
group_idx * head_elem_num_per_group;
|
||||||
|
|
||||||
|
vec_op::FP32Vec16 acc;
|
||||||
|
for (int i = 0; i < partition_num; ++i) {
|
||||||
|
vec_op::FP32Vec16 rescale_factor(seq_head_rescale_factors[i]);
|
||||||
|
v_load_vec_type value(seq_head_tmp_out + i * HEAD_SIZE);
|
||||||
|
vec_op::FP32Vec16 fp32_value(value);
|
||||||
|
acc = acc + fp32_value * rescale_factor;
|
||||||
|
}
|
||||||
|
v_load_vec_type cast_acc(acc);
|
||||||
|
cast_acc.save(seq_head_output);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
#define LAUNCH_V2_ATTENTION_KERNEL(T, HEAD_SIZE, BLOCK_SIZE) \
|
||||||
|
paged_attention_v2_impl<T, HEAD_SIZE, BLOCK_SIZE, PARTITION_SIZE>::call( \
|
||||||
|
out_ptr, exp_sums_ptr, max_logits_ptr, tmp_out_ptr, query_ptr, \
|
||||||
|
key_cache_ptr, value_cache_ptr, num_kv_heads, scale, block_tables_ptr, \
|
||||||
|
seq_lens_ptr, max_num_blocks_per_seq, alibi_slopes_ptr, q_stride, \
|
||||||
|
kv_block_stride, kv_head_stride, num_seqs, num_heads, \
|
||||||
|
max_num_partitions);
|
||||||
|
|
||||||
|
template <typename T, int BLOCK_SIZE, int PARTITION_SIZE = 512>
|
||||||
|
void paged_attention_v2_impl_launcher(
|
||||||
|
torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits,
|
||||||
|
torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache,
|
||||||
|
torch::Tensor& value_cache, int num_kv_heads, float scale,
|
||||||
|
torch::Tensor& block_tables, torch::Tensor& seq_lens, int block_size,
|
||||||
|
int max_seq_len, const c10::optional<torch::Tensor>& alibi_slopes) {
|
||||||
|
int num_seqs = query.size(0);
|
||||||
|
int num_heads = query.size(1);
|
||||||
|
int head_size = query.size(2);
|
||||||
|
int max_num_blocks_per_seq = block_tables.size(1);
|
||||||
|
int q_stride = query.stride(0);
|
||||||
|
int kv_block_stride = key_cache.stride(0);
|
||||||
|
int kv_head_stride = key_cache.stride(1);
|
||||||
|
int max_num_partitions = exp_sums.size(-1);
|
||||||
|
|
||||||
|
// NOTE: alibi_slopes is optional.
|
||||||
|
const float* alibi_slopes_ptr =
|
||||||
|
alibi_slopes
|
||||||
|
? reinterpret_cast<const float*>(alibi_slopes.value().data_ptr())
|
||||||
|
: nullptr;
|
||||||
|
|
||||||
|
T* out_ptr = reinterpret_cast<T*>(out.data_ptr());
|
||||||
|
float* exp_sums_ptr = reinterpret_cast<float*>(exp_sums.data_ptr());
|
||||||
|
float* max_logits_ptr = reinterpret_cast<float*>(max_logits.data_ptr());
|
||||||
|
T* tmp_out_ptr = reinterpret_cast<T*>(tmp_out.data_ptr());
|
||||||
|
T* query_ptr = reinterpret_cast<T*>(query.data_ptr());
|
||||||
|
T* key_cache_ptr = reinterpret_cast<T*>(key_cache.data_ptr());
|
||||||
|
T* value_cache_ptr = reinterpret_cast<T*>(value_cache.data_ptr());
|
||||||
|
int* block_tables_ptr = block_tables.data_ptr<int>();
|
||||||
|
int* seq_lens_ptr = seq_lens.data_ptr<int>();
|
||||||
|
|
||||||
|
switch (head_size) {
|
||||||
|
case 64:
|
||||||
|
LAUNCH_V2_ATTENTION_KERNEL(T, 64, BLOCK_SIZE);
|
||||||
|
break;
|
||||||
|
case 80:
|
||||||
|
LAUNCH_V2_ATTENTION_KERNEL(T, 80, BLOCK_SIZE);
|
||||||
|
break;
|
||||||
|
case 96:
|
||||||
|
LAUNCH_V2_ATTENTION_KERNEL(T, 96, BLOCK_SIZE);
|
||||||
|
break;
|
||||||
|
case 112:
|
||||||
|
LAUNCH_V2_ATTENTION_KERNEL(T, 112, BLOCK_SIZE);
|
||||||
|
break;
|
||||||
|
case 128:
|
||||||
|
LAUNCH_V2_ATTENTION_KERNEL(T, 128, BLOCK_SIZE);
|
||||||
|
break;
|
||||||
|
case 192:
|
||||||
|
LAUNCH_V2_ATTENTION_KERNEL(T, 192, BLOCK_SIZE);
|
||||||
|
break;
|
||||||
|
case 256:
|
||||||
|
LAUNCH_V2_ATTENTION_KERNEL(T, 256, BLOCK_SIZE);
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
TORCH_CHECK(false, "Unsupported head size: ", head_size);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#define CALL_V2_KERNEL_LAUNCHER(T, BLOCK_SIZE) \
|
||||||
|
paged_attention_v2_impl_launcher<T, BLOCK_SIZE>( \
|
||||||
|
out, exp_sums, max_logits, tmp_out, query, key_cache, value_cache, \
|
||||||
|
num_kv_heads, scale, block_tables, seq_lens, block_size, max_seq_len, \
|
||||||
|
alibi_slopes);
|
||||||
|
|
||||||
|
#define CALL_V2_KERNEL_LAUNCHER_BLOCK_SIZE(T) \
|
||||||
|
switch (block_size) { \
|
||||||
|
case 16: \
|
||||||
|
CALL_V2_KERNEL_LAUNCHER(T, 16); \
|
||||||
|
break; \
|
||||||
|
default: \
|
||||||
|
TORCH_CHECK(false, "Unsupported block size: ", block_size); \
|
||||||
|
break; \
|
||||||
|
}
|
||||||
|
} // namespace
|
||||||
|
|
||||||
|
void paged_attention_v2(
|
||||||
|
torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits,
|
||||||
|
torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache,
|
||||||
|
torch::Tensor& value_cache, int64_t num_kv_heads, double scale,
|
||||||
|
torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size,
|
||||||
|
int64_t max_seq_len, const c10::optional<torch::Tensor>& alibi_slopes,
|
||||||
|
const std::string& kv_cache_dtype, double kv_scale, const int64_t tp_rank,
|
||||||
|
const int64_t blocksparse_local_blocks,
|
||||||
|
const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
|
||||||
|
const int64_t blocksparse_head_sliding_step) {
|
||||||
|
TORCH_CHECK(kv_scale == 1.0f);
|
||||||
|
TORCH_CHECK(blocksparse_vert_stride <= 1,
|
||||||
|
"CPU backend does not support blocksparse attention yet.");
|
||||||
|
VLLM_DISPATCH_FLOATING_TYPES(query.scalar_type(), "paged_attention_v2_impl",
|
||||||
|
[&] {
|
||||||
|
CPU_KERNEL_GUARD_IN(paged_attention_v2_impl)
|
||||||
|
CALL_V2_KERNEL_LAUNCHER_BLOCK_SIZE(scalar_t);
|
||||||
|
CPU_KERNEL_GUARD_OUT(paged_attention_v2_impl)
|
||||||
|
});
|
||||||
|
}
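The v2 path above splits each sequence into PARTITION_SIZE-token partitions, runs the v1-style computation per partition into tmp_out, and then blends the partial outputs using the weights that reducePartitonSoftmax wrote back into exp_sums. The short helper below restates that final reduction for one (sequence, head) pair in scalar form; it is an illustrative sketch with a made-up name, not code from the file.

// Illustrative scalar restatement of the per-partition output merge.
inline void mergePartitionsReference(const float* partition_weights,  // [partition_num]
                                     const float* tmp_out,            // [partition_num, head_size]
                                     float* out, int partition_num, int head_size) {
  for (int i = 0; i < head_size; ++i) {
    float acc = 0.0f;
    for (int p = 0; p < partition_num; ++p)
      acc += partition_weights[p] * tmp_out[p * head_size + i];
    out[i] = acc;
  }
}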
137  csrc/cpu/cache.cpp  Normal file
@@ -0,0 +1,137 @@
|
|||||||
|
#include <map>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
#include "cpu_types.hpp"
|
||||||
|
|
||||||
|
namespace {
|
||||||
|
template <typename scalar_t>
|
||||||
|
void copy_blocks_cpu_impl(std::vector<torch::Tensor> const& key_caches,
|
||||||
|
std::vector<torch::Tensor> const& value_caches,
|
||||||
|
const torch::Tensor& mapping_pairs,
|
||||||
|
const int element_num_per_block,
|
||||||
|
const int layer_num) {
|
||||||
|
const size_t pair_num = mapping_pairs.size(0);
|
||||||
|
const size_t block_bytes = sizeof(scalar_t) * element_num_per_block;
|
||||||
|
#pragma omp parallel for collapse(2)
|
||||||
|
for (int layer = 0; layer < layer_num; ++layer) {
|
||||||
|
for (size_t pair = 0; pair < pair_num; ++pair) {
|
||||||
|
int64_t source_offset =
|
||||||
|
element_num_per_block * mapping_pairs[pair][0].item<int64_t>();
|
||||||
|
int64_t target_offset =
|
||||||
|
element_num_per_block * mapping_pairs[pair][1].item<int64_t>();
|
||||||
|
scalar_t* key_cache_ptr = key_caches[layer].data_ptr<scalar_t>();
|
||||||
|
scalar_t* source_ptr = key_cache_ptr + source_offset;
|
||||||
|
scalar_t* target_ptr = key_cache_ptr + target_offset;
|
||||||
|
std::memcpy(target_ptr, source_ptr, block_bytes);
|
||||||
|
|
||||||
|
scalar_t* value_cache_ptr = value_caches[layer].data_ptr<scalar_t>();
|
||||||
|
source_ptr = value_cache_ptr + source_offset;
|
||||||
|
target_ptr = value_cache_ptr + target_offset;
|
||||||
|
std::memcpy(target_ptr, source_ptr, block_bytes);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename scalar_t>
|
||||||
|
void reshape_and_cache_cpu_impl(
|
||||||
|
const scalar_t* __restrict__ key, const scalar_t* __restrict__ value,
|
||||||
|
scalar_t* __restrict__ key_cache, scalar_t* __restrict__ value_cache,
|
||||||
|
const int64_t* __restrict__ slot_mapping, const int num_tokens,
|
||||||
|
const int key_stride, const int value_stride, const int num_heads,
|
||||||
|
const int head_size, const int block_size, const int x) {
|
||||||
|
const int block_elem_num = num_heads * head_size * block_size;
|
||||||
|
|
||||||
|
#pragma omp parallel for collapse(2)
|
||||||
|
for (int token_idx = 0; token_idx < num_tokens; ++token_idx) {
|
||||||
|
for (int head_idx = 0; head_idx < num_heads; ++head_idx) {
|
||||||
|
const int64_t slot_idx = slot_mapping[token_idx];
|
||||||
|
if (slot_idx >= 0) {
|
||||||
|
int src_key_head_idx = token_idx * key_stride + head_idx * head_size;
|
||||||
|
int src_value_head_idx =
|
||||||
|
token_idx * value_stride + head_idx * head_size;
|
||||||
|
const scalar_t* src_key_head_ptr = key + src_key_head_idx;
|
||||||
|
const scalar_t* src_value_head_ptr = value + src_value_head_idx;
|
||||||
|
const int64_t block_index = slot_idx / block_size;
|
||||||
|
const int64_t block_offset = slot_idx % block_size;
|
||||||
|
scalar_t* target_key_head_ptr = key_cache +
|
||||||
|
block_elem_num * block_index +
|
||||||
|
head_idx * block_size * head_size;
|
||||||
|
scalar_t* target_value_head_ptr = value_cache +
|
||||||
|
block_elem_num * block_index +
|
||||||
|
head_idx * block_size * head_size;
|
||||||
|
|
||||||
|
for (int src_key_idx = 0; src_key_idx < head_size; src_key_idx += x) {
|
||||||
|
const int64_t target_offset =
|
||||||
|
src_key_idx * block_size + block_offset * x;
|
||||||
|
for (int i = 0; i < x; ++i) {
|
||||||
|
target_key_head_ptr[target_offset + i] =
|
||||||
|
src_key_head_ptr[src_key_idx + i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int src_value_idx = 0; src_value_idx < head_size;
|
||||||
|
++src_value_idx) {
|
||||||
|
const int64_t target_offset =
|
||||||
|
src_value_idx * block_size + block_offset;
|
||||||
|
target_value_head_ptr[target_offset] =
|
||||||
|
src_value_head_ptr[src_value_idx];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}; // namespace
|
||||||
|
|
||||||
|
// Note: the key_caches and value_caches vectors are constant but
|
||||||
|
// not the Tensors they contain. The vectors need to be const refs
|
||||||
|
// in order to satisfy pytorch's C++ operator registration code.
|
||||||
|
void copy_blocks(std::vector<torch::Tensor> const& key_caches,
|
||||||
|
std::vector<torch::Tensor> const& value_caches,
|
||||||
|
const torch::Tensor& block_mapping) {
|
||||||
|
unsigned num_layers = key_caches.size();
|
||||||
|
TORCH_CHECK(num_layers == value_caches.size());
|
||||||
|
if (num_layers == 0) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
const int element_num_per_block = key_caches[0][0].numel();
|
||||||
|
VLLM_DISPATCH_FLOATING_TYPES(
|
||||||
|
key_caches[0].scalar_type(), "copy_blocks_cpu_impl", [&] {
|
||||||
|
CPU_KERNEL_GUARD_IN(copy_blocks_cpu_impl)
|
||||||
|
copy_blocks_cpu_impl<scalar_t>(key_caches, value_caches, block_mapping,
|
||||||
|
element_num_per_block, num_layers);
|
||||||
|
CPU_KERNEL_GUARD_OUT(copy_blocks_cpu_impl)
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
void reshape_and_cache(torch::Tensor& key, torch::Tensor& value,
|
||||||
|
torch::Tensor& key_cache, torch::Tensor& value_cache,
|
||||||
|
torch::Tensor& slot_mapping,
|
||||||
|
const std::string& kv_cache_dtype, double kv_scale) {
|
||||||
|
TORCH_CHECK(kv_scale == 1.0f);
|
||||||
|
|
||||||
|
int num_tokens = key.size(0);
|
||||||
|
int num_heads = key.size(1);
|
||||||
|
int head_size = key.size(2);
|
||||||
|
int block_size = key_cache.size(3);
|
||||||
|
int x = key_cache.size(4);
|
||||||
|
|
||||||
|
int key_stride = key.stride(0);
|
||||||
|
int value_stride = value.stride(0);
|
||||||
|
|
||||||
|
VLLM_DISPATCH_FLOATING_TYPES(
|
||||||
|
key.scalar_type(), "reshape_and_cache_cpu_impl", [&] {
|
||||||
|
CPU_KERNEL_GUARD_IN(reshape_and_cache_cpu_impl)
|
||||||
|
reshape_and_cache_cpu_impl<scalar_t>(
|
||||||
|
key.data_ptr<scalar_t>(), value.data_ptr<scalar_t>(),
|
||||||
|
key_cache.data_ptr<scalar_t>(), value_cache.data_ptr<scalar_t>(),
|
||||||
|
slot_mapping.data_ptr<int64_t>(), num_tokens, key_stride,
|
||||||
|
value_stride, num_heads, head_size, block_size, x);
|
||||||
|
CPU_KERNEL_GUARD_OUT(reshape_and_cache_cpu_impl)
|
||||||
|
});
|
||||||
|
}
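// --- Illustrative note (added, not in the original file) -----------------------
// reshape_and_cache_cpu_impl scatters one token's K/V into the paged cache:
//   keys go to   key_cache[block, head, d / x, slot % block_size, d % x]
//   values go to value_cache[block, head, d, slot % block_size]
// where block = slot / block_size and d indexes head_size. Flat-index restatement
// of the same arithmetic (hypothetical helpers, documentation only):
inline int64_t keyCacheIndexReference(int64_t block, int head, int d,
                                      int64_t block_offset, int num_heads,
                                      int head_size, int block_size, int x) {
  return ((block * num_heads + head) * (head_size / x) + d / x) *
             (static_cast<int64_t>(block_size) * x) +
         block_offset * x + d % x;
}
inline int64_t valueCacheIndexReference(int64_t block, int head, int d,
                                        int64_t block_offset, int num_heads,
                                        int head_size, int block_size) {
  return ((block * num_heads + head) * head_size + d) * block_size + block_offset;
}
// -------------------------------------------------------------------------------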
|
||||||
|
|
||||||
|
void swap_blocks(torch::Tensor& src, torch::Tensor& dst,
|
||||||
|
const torch::Tensor& block_mapping) {
|
||||||
|
TORCH_CHECK(false, "swap_blocks is unsupported on CPU.")
|
||||||
|
}
352  csrc/cpu/cpu_types.hpp  Normal file
@@ -0,0 +1,352 @@
|
|||||||
|
|
||||||
|
#ifndef CPU_TYPES_HPP
|
||||||
|
#define CPU_TYPES_HPP
|
||||||
|
|
||||||
|
#include <immintrin.h>
|
||||||
|
#include <torch/all.h>
|
||||||
|
|
||||||
|
namespace vec_op {
|
||||||
|
|
||||||
|
// FIXME: FP16 is not fully supported in Torch-CPU
|
||||||
|
#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \
|
||||||
|
AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \
|
||||||
|
AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__)
|
||||||
|
|
||||||
|
#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \
|
||||||
|
AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__))
|
||||||
|
|
||||||
|
#ifndef CPU_OP_GUARD
|
||||||
|
#define CPU_KERNEL_GUARD_IN(NAME)
|
||||||
|
#define CPU_KERNEL_GUARD_OUT(NAME)
|
||||||
|
#else
|
||||||
|
#define CPU_KERNEL_GUARD_IN(NAME) \
|
||||||
|
std::cout << #NAME << " invoked." << std::endl;
|
||||||
|
#define CPU_KERNEL_GUARD_OUT(NAME) std::cout << #NAME << " exit." << std::endl;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#define FORCE_INLINE __attribute__((always_inline)) inline
|
||||||
|
|
||||||
|
namespace {
|
||||||
|
template <typename T, T... indexes, typename F>
|
||||||
|
constexpr void unroll_loop_item(std::integer_sequence<T, indexes...>, F &&f) {
|
||||||
|
(f(std::integral_constant<T, indexes>{}), ...);
|
||||||
|
}
|
||||||
|
}; // namespace
|
||||||
|
|
||||||
|
template <typename T, T count, typename F,
|
||||||
|
typename = std::enable_if_t<std::is_invocable_v<F, T>>>
|
||||||
|
constexpr void unroll_loop(F &&f) {
|
||||||
|
unroll_loop_item(std::make_integer_sequence<T, count>{}, std::forward<F>(f));
|
||||||
|
}
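// --- Illustrative usage (added, not in the original file) ----------------------
// unroll_loop expands its body at compile time, one call per index in [0, count).
// Each index is passed as a std::integral_constant, so a generic lambda can treat
// it as a compile-time constant; with a plain int parameter it behaves like an
// ordinary unrolled loop. A minimal example (hypothetical helper):
inline float unrollLoopExample(const float (&values)[4]) {
  float sum = 0.0f;
  unroll_loop<int, 4>([&](int i) { sum += values[i]; });  // fully unrolled
  return sum;
}
// -------------------------------------------------------------------------------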
|
||||||
|
|
||||||
|
template <typename T> struct Vec {
|
||||||
|
constexpr static int get_elem_num() { return T::VEC_ELEM_NUM; }
|
||||||
|
};
|
||||||
|
|
||||||
|
struct FP32Vec8;
|
||||||
|
struct FP32Vec16;
|
||||||
|
|
||||||
|
#ifdef __AVX512FP16__
|
||||||
|
struct FP16Vec8 : public Vec<FP16Vec8> {
|
||||||
|
constexpr static int VEC_ELEM_NUM = 8;
|
||||||
|
|
||||||
|
__m128h reg;
|
||||||
|
|
||||||
|
explicit FP16Vec8(_Float16 v) : reg(_mm_set1_ph(v)) {}
|
||||||
|
|
||||||
|
explicit FP16Vec8(const void *ptr) : reg(_mm_loadu_ph(ptr)) {}
|
||||||
|
|
||||||
|
explicit FP16Vec8(__m128h data) : reg(data) {}
|
||||||
|
|
||||||
|
FP16Vec8 operator*(const FP16Vec8 &b) const {
|
||||||
|
return FP16Vec8(_mm_mul_ph(reg, b.reg));
|
||||||
|
}
|
||||||
|
|
||||||
|
FP16Vec8 operator+(const FP16Vec8 &b) const {
|
||||||
|
return FP16Vec8(_mm_add_ph(reg, b.reg));
|
||||||
|
}
|
||||||
|
|
||||||
|
FP16Vec8 operator-(const FP16Vec8 &b) const {
|
||||||
|
return FP16Vec8(_mm_sub_ph(reg, b.reg));
|
||||||
|
}
|
||||||
|
|
||||||
|
FP16Vec8 operator/(const FP16Vec8 &b) const {
|
||||||
|
return FP16Vec8(_mm_div_ph(reg, b.reg));
|
||||||
|
}
|
||||||
|
|
||||||
|
void save(void *ptr) const { _mm_storeu_ph(ptr, reg); }
|
||||||
|
};
|
||||||
|
#endif
|
||||||
|
|
||||||
|
struct BF16Vec8 : public Vec<BF16Vec8> {
|
||||||
|
constexpr static int VEC_ELEM_NUM = 8;
|
||||||
|
|
||||||
|
__m128i reg;
|
||||||
|
|
||||||
|
explicit BF16Vec8(const void *ptr)
|
||||||
|
: reg((__m128i)_mm_loadu_si128((__m128i *)ptr)) {}
|
||||||
|
|
||||||
|
explicit BF16Vec8(const FP32Vec8 &);
|
||||||
|
|
||||||
|
void save(void *ptr) const { *reinterpret_cast<__m128i *>(ptr) = reg; }
|
||||||
|
};
|
||||||
|
|
||||||
|
struct BF16Vec16 : public Vec<BF16Vec16> {
|
||||||
|
constexpr static int VEC_ELEM_NUM = 16;
|
||||||
|
|
||||||
|
__m256i reg;
|
||||||
|
|
||||||
|
explicit BF16Vec16(const void *ptr)
|
||||||
|
: reg((__m256i)_mm256_loadu_si256((__m256i *)ptr)) {}
|
||||||
|
|
||||||
|
explicit BF16Vec16(const FP32Vec16 &);
|
||||||
|
|
||||||
|
void save(void *ptr) const { *reinterpret_cast<__m256i *>(ptr) = reg; }
|
||||||
|
};
|
||||||
|
|
||||||
|
struct BF16Vec32 : public Vec<BF16Vec32> {
|
||||||
|
constexpr static int VEC_ELEM_NUM = 32;
|
||||||
|
|
||||||
|
__m512i reg;
|
||||||
|
|
||||||
|
explicit BF16Vec32(const void *ptr) : reg((__m512i)_mm512_loadu_si512(ptr)) {}
|
||||||
|
|
||||||
|
explicit BF16Vec32(__m512i data) : reg(data) {}
|
||||||
|
|
||||||
|
explicit BF16Vec32(BF16Vec8 &vec8_data)
|
||||||
|
: reg((__m512i)_mm512_inserti32x4(
|
||||||
|
_mm512_inserti32x4(_mm512_inserti32x4(_mm512_castsi128_si512(
|
||||||
|
(__m128i)vec8_data.reg),
|
||||||
|
(__m128i)vec8_data.reg, 1),
|
||||||
|
(__m128i)vec8_data.reg, 2),
|
||||||
|
(__m128i)vec8_data.reg, 3)) {}
|
||||||
|
|
||||||
|
void save(void *ptr) const { *reinterpret_cast<__m512i *>(ptr) = reg; }
|
||||||
|
};
|
||||||
|
|
||||||
|
struct FP32Vec4 : public Vec<FP32Vec4> {
|
||||||
|
constexpr static int VEC_ELEM_NUM = 4;
|
||||||
|
union AliasReg {
|
||||||
|
__m128 reg;
|
||||||
|
float values[VEC_ELEM_NUM];
|
||||||
|
};
|
||||||
|
|
||||||
|
__m128 reg;
|
||||||
|
|
||||||
|
explicit FP32Vec4(float v) : reg(_mm_set1_ps(v)) {}
|
||||||
|
|
||||||
|
explicit FP32Vec4() : reg(_mm_set1_ps(0.0)) {}
|
||||||
|
|
||||||
|
explicit FP32Vec4(const float *ptr) : reg(_mm_loadu_ps(ptr)) {}
|
||||||
|
|
||||||
|
explicit FP32Vec4(__m128 data) : reg(data) {}
|
||||||
|
|
||||||
|
  explicit FP32Vec4(const FP32Vec4 &data) : reg(data.reg) {}
};

struct FP32Vec8 : public Vec<FP32Vec8> {
  constexpr static int VEC_ELEM_NUM = 8;
  union AliasReg {
    __m256 reg;
    float values[VEC_ELEM_NUM];
  };

  __m256 reg;

  explicit FP32Vec8(float v) : reg(_mm256_set1_ps(v)) {}

  explicit FP32Vec8() : reg(_mm256_set1_ps(0.0)) {}

  explicit FP32Vec8(const float *ptr) : reg(_mm256_loadu_ps(ptr)) {}

  explicit FP32Vec8(__m256 data) : reg(data) {}

  explicit FP32Vec8(const FP32Vec8 &data) : reg(data.reg) {}

#ifdef __AVX512FP16__
  explicit FP32Vec8(__m128h v) : reg(_mm256_cvtph_ps(_mm_castph_si128(v))) {}
#endif

  explicit FP32Vec8(const BF16Vec8 &v)
      : reg(_mm256_castsi256_ps(
            _mm256_bslli_epi128(_mm256_cvtepu16_epi32(v.reg), 2))) {}

  float reduce_sum() const {
    AliasReg ar;
    ar.reg = reg;
    float result = 0;
    unroll_loop<int, VEC_ELEM_NUM>([&result, &ar](int i) { result += ar.values[i]; });

    return result;
  }

  FP32Vec8 exp() const {
    AliasReg ar;
    ar.reg = reg;
    return FP32Vec8(_mm256_set_ps(expf(ar.values[7]), expf(ar.values[6]),
                                  expf(ar.values[5]), expf(ar.values[4]),
                                  expf(ar.values[3]), expf(ar.values[2]),
                                  expf(ar.values[1]), expf(ar.values[0])));
  }

  FP32Vec8 tanh() const {
    AliasReg ar;
    ar.reg = reg;
    return FP32Vec8(_mm256_set_ps(tanhf(ar.values[7]), tanhf(ar.values[6]),
                                  tanhf(ar.values[5]), tanhf(ar.values[4]),
                                  tanhf(ar.values[3]), tanhf(ar.values[2]),
                                  tanhf(ar.values[1]), tanhf(ar.values[0])));
  }

  FP32Vec8 er() const {
    AliasReg ar;
    ar.reg = reg;
    return FP32Vec8(_mm256_set_ps(erf(ar.values[7]), erf(ar.values[6]),
                                  erf(ar.values[5]), erf(ar.values[4]),
                                  erf(ar.values[3]), erf(ar.values[2]),
                                  erf(ar.values[1]), erf(ar.values[0])));
  }

  FP32Vec8 operator*(const FP32Vec8 &b) const {
    return FP32Vec8(_mm256_mul_ps(reg, b.reg));
  }

  FP32Vec8 operator+(const FP32Vec8 &b) const {
    return FP32Vec8(_mm256_add_ps(reg, b.reg));
  }

  FP32Vec8 operator-(const FP32Vec8 &b) const {
    return FP32Vec8(_mm256_sub_ps(reg, b.reg));
  }

  FP32Vec8 operator/(const FP32Vec8 &b) const {
    return FP32Vec8(_mm256_div_ps(reg, b.reg));
  }

  void save(float *ptr) const { _mm256_storeu_ps(ptr, reg); }
};

struct FP32Vec16 : public Vec<FP32Vec16> {
  constexpr static int VEC_ELEM_NUM = 16;
  union AliasReg {
    __m512 reg;
    float values[VEC_ELEM_NUM];
  };

  __m512 reg;

  explicit FP32Vec16(float v) : reg(_mm512_set1_ps(v)) {}

  explicit FP32Vec16() : reg(_mm512_set1_ps(0.0)) {}

  explicit FP32Vec16(const float *ptr) : reg(_mm512_loadu_ps(ptr)) {}

  explicit FP32Vec16(__m512 data) : reg(data) {}

  explicit FP32Vec16(const FP32Vec16 &data) : reg(data.reg) {}

  explicit FP32Vec16(const FP32Vec4 &data)
      : reg((__m512)_mm512_inserti32x4(
            _mm512_inserti32x4(
                _mm512_inserti32x4(_mm512_castsi128_si512((__m128i)data.reg),
                                   (__m128i)data.reg, 1),
                (__m128i)data.reg, 2),
            (__m128i)data.reg, 3)) {}

  explicit FP32Vec16(const FP32Vec8 &data)
      : reg((__m512)_mm512_inserti32x8(
            _mm512_castsi256_si512((__m256i)data.reg), (__m256i)data.reg, 1)) {}

  explicit FP32Vec16(const BF16Vec16 &v)
      : reg(_mm512_castsi512_ps(
            _mm512_bslli_epi128(_mm512_cvtepu16_epi32(v.reg), 2))) {}

  explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {}

  FP32Vec16 operator*(const FP32Vec16 &b) const {
    return FP32Vec16(_mm512_mul_ps(reg, b.reg));
  }

  FP32Vec16 operator+(const FP32Vec16 &b) const {
    return FP32Vec16(_mm512_add_ps(reg, b.reg));
  }

  FP32Vec16 operator-(const FP32Vec16 &b) const {
    return FP32Vec16(_mm512_sub_ps(reg, b.reg));
  }

  FP32Vec16 operator/(const FP32Vec16 &b) const {
    return FP32Vec16(_mm512_div_ps(reg, b.reg));
  }

  float reduce_sum() const { return _mm512_reduce_add_ps(reg); }

  template <int group_size> float reduce_sub_sum(int idx) {
    static_assert(VEC_ELEM_NUM % group_size == 0);
    constexpr uint32_t base_mask = (0xFFFF >> (16 - group_size));
    __mmask16 mask = _cvtu32_mask16(base_mask << (idx * group_size));
    return _mm512_mask_reduce_add_ps(mask, reg);
  }

  void save(float *ptr) const { _mm512_storeu_ps(ptr, reg); }
};

template <typename T> struct VecType { using vec_type = void; };

template <typename T> using vec_t = typename VecType<T>::vec_type;

template <> struct VecType<float> { using vec_type = FP32Vec8; };

#ifdef __AVX512FP16__
template <> struct VecType<c10::Half> { using vec_type = FP16Vec16; };
#endif

template <> struct VecType<c10::BFloat16> { using vec_type = BF16Vec8; };

template <typename T> void storeFP32(float v, T *ptr) { *ptr = v; }

#ifdef __AVX512FP16__
template <> inline void storeFP32<c10::Half>(float v, c10::Half *ptr) {
  *reinterpret_cast<_Float16 *>(ptr) = v;
}
#endif

inline void fma(FP32Vec16 &acc, FP32Vec16 &a, FP32Vec16 &b) {
  acc = acc + a * b;
}

#ifdef __AVX512BF16__
template <> inline void storeFP32<c10::BFloat16>(float v, c10::BFloat16 *ptr) {
  *reinterpret_cast<__bfloat16 *>(ptr) = _mm_cvtness_sbh(v);
}

inline BF16Vec8::BF16Vec8(const FP32Vec8 &v)
    : reg((__m128i)_mm256_cvtneps_pbh(v.reg)) {}

inline BF16Vec16::BF16Vec16(const FP32Vec16 &v)
    : reg((__m256i)_mm512_cvtneps_pbh(v.reg)) {}

inline void fma(FP32Vec16 &acc, BF16Vec32 &a, BF16Vec32 &b) {
  acc.reg = _mm512_dpbf16_ps(acc.reg, (__m512bh)a.reg, (__m512bh)b.reg);
}
#else
template <> inline void storeFP32<c10::BFloat16>(float v, c10::BFloat16 *ptr) {
  c10::BFloat16 __attribute__((__may_alias__)) *v_ptr =
      reinterpret_cast<c10::BFloat16 *>(&v);
  *ptr = *(v_ptr + 1);
}

inline BF16Vec8::BF16Vec8(const FP32Vec8 &v)
    : reg(_mm256_cvtepi32_epi16(
          _mm256_bsrli_epi128(_mm256_castps_si256(v.reg), 2))) {}

inline BF16Vec16::BF16Vec16(const FP32Vec16 &v)
    : reg(_mm512_cvtepi32_epi16(
          _mm512_bsrli_epi128(_mm512_castps_si512(v.reg), 2))) {}
#endif

inline void prefetch(const void *addr) { _mm_prefetch(addr, _MM_HINT_T1); }

}; // namespace vec_op

#endif
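For reference, reduce_sum, exp, tanh and er above all follow the same pattern: copy the SIMD register into a union so the lanes can be read as plain floats, apply a scalar function per lane, and repack. Below is a minimal scalar sketch of that pattern; the union and loop stand in for __m256 and unroll_loop and are illustrative only, not part of the header.

#include <cmath>
#include <cstdio>

// Stand-in for FP32Vec8::AliasReg: lets 8 packed lanes be read as scalars.
union AliasReg8 {
  float values[8];
};

// Scalar equivalent of FP32Vec8::reduce_sum().
static float reduce_sum(const AliasReg8& ar) {
  float result = 0.0f;
  for (int i = 0; i < 8; ++i) result += ar.values[i];  // mirrors unroll_loop
  return result;
}

// Scalar equivalent of FP32Vec8::exp(): apply exp lane by lane, then repack.
static AliasReg8 lanewise_exp(const AliasReg8& ar) {
  AliasReg8 out;
  for (int i = 0; i < 8; ++i) out.values[i] = std::exp(ar.values[i]);
  return out;
}

int main() {
  AliasReg8 v = {{1, 2, 3, 4, 5, 6, 7, 8}};
  std::printf("sum = %f, exp(lane0) = %f\n", reduce_sum(v),
              lanewise_exp(v).values[0]);
  return 0;
}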
csrc/cpu/layernorm.cpp (new file, 117 lines)
@@ -0,0 +1,117 @@
#include "cpu_types.hpp"

namespace {
template <typename scalar_t>
void rms_norm_impl(scalar_t* __restrict__ out,
                   const scalar_t* __restrict__ input,
                   const scalar_t* __restrict__ weight, const float epsilon,
                   const int num_tokens, const int hidden_size) {
  using scalar_vec_t = vec_op::vec_t<scalar_t>;
  constexpr int VEC_ELEM_NUM = scalar_vec_t::get_elem_num();
  TORCH_CHECK(hidden_size % VEC_ELEM_NUM == 0);

#pragma omp parallel for
  for (int i = 0; i < num_tokens; ++i) {
    vec_op::FP32Vec8 variance(0.0);
    auto input_p = input + i * hidden_size;
    auto output_p = out + i * hidden_size;
    for (int j = 0; j < hidden_size; j += VEC_ELEM_NUM) {
      scalar_vec_t x(input_p + j);
      vec_op::FP32Vec8 fp32_x(x);
      variance = variance + fp32_x * fp32_x;
    }

    float s_variance =
        1.0f / sqrtf(variance.reduce_sum() / (float)hidden_size + epsilon);
    vec_op::FP32Vec8 fp32_s_variance(s_variance);

    for (int j = 0; j < hidden_size; j += VEC_ELEM_NUM) {
      scalar_vec_t x(input_p + j);
      scalar_vec_t w(weight + j);

      vec_op::FP32Vec8 fp32_x(x);
      vec_op::FP32Vec8 fp32_w(w);

      vec_op::FP32Vec8 fp32_out = fp32_x * fp32_s_variance * fp32_w;

      scalar_vec_t out(fp32_out);
      out.save(output_p + j);
    }
  }
}

template <typename scalar_t>
void fused_add_rms_norm_impl(scalar_t* __restrict__ input,
                             scalar_t* __restrict__ residual,
                             const scalar_t* __restrict__ weight,
                             const float epsilon, const int num_tokens,
                             const int hidden_size) {
  using scalar_vec_t = vec_op::vec_t<scalar_t>;
  constexpr int VEC_ELEM_NUM = scalar_vec_t::get_elem_num();
  TORCH_CHECK(hidden_size % VEC_ELEM_NUM == 0);

#pragma omp parallel for
  for (int i = 0; i < num_tokens; ++i) {
    vec_op::FP32Vec8 variance(0.0);
    auto input_p = input + i * hidden_size;
    auto residual_p = residual + i * hidden_size;
    for (int j = 0; j < hidden_size; j += VEC_ELEM_NUM) {
      scalar_vec_t x(input_p + j);
      scalar_vec_t res(residual_p + j);
      vec_op::FP32Vec8 fp32_x(x);
      vec_op::FP32Vec8 fp32_res(res);

      fp32_x = fp32_x + fp32_res;
      variance = variance + fp32_x * fp32_x;
      scalar_vec_t out(fp32_x);
      out.save(residual_p + j);
    }

    float s_variance =
        1.0f / sqrtf(variance.reduce_sum() / (float)hidden_size + epsilon);
    vec_op::FP32Vec8 fp32_s_variance(s_variance);

    for (int j = 0; j < hidden_size; j += VEC_ELEM_NUM) {
      scalar_vec_t w(weight + j);
      scalar_vec_t res(residual_p + j);

      vec_op::FP32Vec8 fp32_w(w);
      vec_op::FP32Vec8 fp32_res(res);

      vec_op::FP32Vec8 fp32_out = fp32_res * fp32_s_variance * fp32_w;

      scalar_vec_t out(fp32_out);
      out.save(input_p + j);
    }
  }
}
}  // namespace

void rms_norm(torch::Tensor& out, torch::Tensor& input, torch::Tensor& weight,
              double epsilon) {
  int hidden_size = input.size(-1);
  int num_tokens = input.numel() / hidden_size;

  VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "rms_norm_impl", [&] {
    CPU_KERNEL_GUARD_IN(rms_norm_impl)
    rms_norm_impl(out.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(),
                  weight.data_ptr<scalar_t>(), epsilon, num_tokens,
                  hidden_size);
    CPU_KERNEL_GUARD_OUT(rms_norm_impl)
  });
}

void fused_add_rms_norm(torch::Tensor& input, torch::Tensor& residual,
                        torch::Tensor& weight, double epsilon) {
  int hidden_size = input.size(-1);
  int num_tokens = input.numel() / hidden_size;

  VLLM_DISPATCH_FLOATING_TYPES(
      input.scalar_type(), "fused_add_rms_norm_impl", [&] {
        CPU_KERNEL_GUARD_IN(fused_add_rms_norm_impl)
        fused_add_rms_norm_impl(
            input.data_ptr<scalar_t>(), residual.data_ptr<scalar_t>(),
            weight.data_ptr<scalar_t>(), epsilon, num_tokens, hidden_size);
        CPU_KERNEL_GUARD_OUT(fused_add_rms_norm_impl)
      });
}
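When reading the vectorized loops above, it may help to see what rms_norm_impl computes for a single token in plain scalar form. The following sketch is illustrative only (it is not part of the file and ignores the SIMD and OpenMP details):

#include <cmath>
#include <vector>

// Scalar sketch of RMS norm for one token:
// out[j] = input[j] / sqrt(mean(input^2) + epsilon) * weight[j]
std::vector<float> rms_norm_reference(const std::vector<float>& input,
                                      const std::vector<float>& weight,
                                      float epsilon) {
  float sum_sq = 0.0f;
  for (float x : input) sum_sq += x * x;
  const float inv_rms =
      1.0f / std::sqrt(sum_sq / static_cast<float>(input.size()) + epsilon);

  std::vector<float> out(input.size());
  for (size_t j = 0; j < input.size(); ++j)
    out[j] = input[j] * inv_rms * weight[j];
  return out;
}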
csrc/cpu/pos_encoding.cpp (new file, 199 lines)
@@ -0,0 +1,199 @@
#include "cpu_types.hpp"

namespace {
template <typename scalar_t>
void rotary_embedding_impl(
    const int64_t* __restrict__ positions,  // [batch_size, seq_len] or
                                            // [num_tokens]
    scalar_t* __restrict__ query,  /// [batch_size, seq_len, num_heads,
                                   /// head_size] or [num_tokens, num_heads,
                                   /// head_size]
    scalar_t* __restrict__ key,  // [batch_size, seq_len, num_kv_heads,
                                 // head_size] or [num_tokens, num_kv_heads,
                                 // head_size]
    const scalar_t* __restrict__ cos_sin_cache,  // [max_position, 2, rot_dim //
                                                 // 2]
    const int rot_dim, const int64_t query_stride, const int64_t key_stride,
    const int num_heads, const int num_kv_heads, const int head_size,
    const int num_tokens) {
  using scalar_vec_t = vec_op::vec_t<scalar_t>;
  constexpr int VEC_ELEM_NUM = scalar_vec_t::get_elem_num();

  const int embed_dim = rot_dim / 2;
  bool flag = (embed_dim % VEC_ELEM_NUM == 0);
  const int loop_upper = flag ? embed_dim : embed_dim - VEC_ELEM_NUM;

  auto compute_loop = [&](const int64_t token_head, const scalar_t* cache_ptr,
                          scalar_t* qk) {
    int j = 0;
    for (; j < loop_upper; j += VEC_ELEM_NUM) {
      const int rot_offset = j;
      const int x_index = rot_offset;
      const int y_index = embed_dim + rot_offset;

      const int64_t out_x = token_head + x_index;
      const int64_t out_y = token_head + y_index;

      const scalar_vec_t cos(cache_ptr + x_index);
      const scalar_vec_t sin(cache_ptr + y_index);

      const scalar_vec_t q_x(qk + out_x);
      const scalar_vec_t q_y(qk + out_y);

      vec_op::FP32Vec8 fp32_cos(cos);
      vec_op::FP32Vec8 fp32_sin(sin);

      vec_op::FP32Vec8 fp32_q_x(q_x);
      vec_op::FP32Vec8 fp32_q_y(q_y);

      auto out1 = fp32_q_x * fp32_cos - fp32_q_y * fp32_sin;
      scalar_vec_t(out1).save(qk + out_x);

      auto out2 = fp32_q_y * fp32_cos + fp32_q_x * fp32_sin;
      scalar_vec_t(out2).save(qk + out_y);
    }
    if (!flag) {
      for (; j < embed_dim; ++j) {
        const int x_index = j;
        const int y_index = embed_dim + j;

        const int64_t out_x = token_head + x_index;
        const int64_t out_y = token_head + y_index;

        const float fp32_cos = cache_ptr[x_index];
        const float fp32_sin = cache_ptr[y_index];

        const float fp32_q_x = qk[out_x];
        const float fp32_q_y = qk[out_y];

        qk[out_x] = fp32_q_x * fp32_cos - fp32_q_y * fp32_sin;
        qk[out_y] = fp32_q_y * fp32_cos + fp32_q_x * fp32_sin;
      }
    }
  };

#pragma omp parallel for
  for (int token_idx = 0; token_idx < num_tokens; ++token_idx) {
    int64_t pos = positions[token_idx];
    const scalar_t* cache_ptr = cos_sin_cache + pos * rot_dim;

    for (int i = 0; i < num_heads; ++i) {
      const int head_idx = i;
      const int64_t token_head =
          token_idx * query_stride + head_idx * head_size;
      compute_loop(token_head, cache_ptr, query);
    }

    for (int i = 0; i < num_kv_heads; ++i) {
      const int head_idx = i;
      const int64_t token_head = token_idx * key_stride + head_idx * head_size;
      compute_loop(token_head, cache_ptr, key);
    }
  }
}

template <typename scalar_t>
void rotary_embedding_gptj_impl(
    const int64_t* __restrict__ positions,  // [batch_size, seq_len] or
                                            // [num_tokens]
    scalar_t* __restrict__ query,  /// [batch_size, seq_len, num_heads,
                                   /// head_size] or [num_tokens, num_heads,
                                   /// head_size]
    scalar_t* __restrict__ key,  // [batch_size, seq_len, num_kv_heads,
                                 // head_size] or [num_tokens, num_kv_heads,
                                 // head_size]
    const scalar_t* __restrict__ cos_sin_cache,  // [max_position, 2, rot_dim //
                                                 // 2]
    const int rot_dim, const int64_t query_stride, const int64_t key_stride,
    const int num_heads, const int num_kv_heads, const int head_size,
    const int num_tokens) {
  const int embed_dim = rot_dim / 2;

#pragma omp parallel for collapse(2)
  for (int token_idx = 0; token_idx < num_tokens; ++token_idx) {
    for (int i = 0; i < num_heads; ++i) {
      int64_t pos = positions[token_idx];
      const scalar_t* cache_ptr = cos_sin_cache + pos * rot_dim;
      const scalar_t* cos_cache_ptr = cache_ptr;
      const scalar_t* sin_cache_ptr = cache_ptr + embed_dim;
      const int head_idx = i;
      const int64_t token_head =
          token_idx * query_stride + head_idx * head_size;
      scalar_t* head_query = token_head + query;
      for (int j = 0; j < embed_dim; j += 1) {
        const int rot_offset = j;
        const int x_index = 2 * rot_offset;
        const int y_index = 2 * rot_offset + 1;

        const float cos = cos_cache_ptr[rot_offset];
        const float sin = sin_cache_ptr[rot_offset];

        const float x = head_query[x_index];
        const float y = head_query[y_index];

        head_query[x_index] = x * cos - y * sin;
        head_query[y_index] = y * cos + x * sin;
      }
    }
  }

#pragma omp parallel for collapse(2)
  for (int token_idx = 0; token_idx < num_tokens; ++token_idx) {
    for (int i = 0; i < num_kv_heads; ++i) {
      int64_t pos = positions[token_idx];
      const scalar_t* cache_ptr = cos_sin_cache + pos * rot_dim;
      const scalar_t* cos_cache_ptr = cache_ptr;
      const scalar_t* sin_cache_ptr = cache_ptr + embed_dim;
      const int head_idx = i;
      const int64_t token_head = token_idx * key_stride + head_idx * head_size;
      scalar_t* head_key = key + token_head;
      for (int j = 0; j < embed_dim; j += 1) {
        const int rot_offset = j;
        const int x_index = 2 * rot_offset;
        const int y_index = 2 * rot_offset + 1;

        const float cos = cos_cache_ptr[rot_offset];
        const float sin = sin_cache_ptr[rot_offset];

        const float x = head_key[x_index];
        const float y = head_key[y_index];

        head_key[x_index] = x * cos - y * sin;
        head_key[y_index] = y * cos + x * sin;
      }
    }
  }
}
};  // namespace

void rotary_embedding(torch::Tensor& positions, torch::Tensor& query,
                      torch::Tensor& key, int64_t head_size,
                      torch::Tensor& cos_sin_cache, bool is_neox) {
  int num_tokens = query.numel() / query.size(-1);
  int rot_dim = cos_sin_cache.size(1);
  int num_heads = query.size(-1) / head_size;
  int num_kv_heads = key.size(-1) / head_size;
  int64_t key_stride = key.stride(-2);
  int64_t query_stride = query.stride(-2);

  VLLM_DISPATCH_FLOATING_TYPES(
      query.scalar_type(), "rotary_embedding_impl", [&] {
        CPU_KERNEL_GUARD_IN(rotary_embedding_impl)
        if (is_neox) {
          rotary_embedding_impl(
              positions.data_ptr<int64_t>(), query.data_ptr<scalar_t>(),
              key.data_ptr<scalar_t>(), cos_sin_cache.data_ptr<scalar_t>(),
              rot_dim, query_stride, key_stride, num_heads, num_kv_heads,
              head_size, num_tokens);
        } else {
          rotary_embedding_gptj_impl(
              positions.data_ptr<int64_t>(), query.data_ptr<scalar_t>(),
              key.data_ptr<scalar_t>(), cos_sin_cache.data_ptr<scalar_t>(),
              rot_dim, query_stride, key_stride, num_heads, num_kv_heads,
              head_size, num_tokens);
        }

        CPU_KERNEL_GUARD_OUT(rotary_embedding_impl)
      });
}
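Both kernels above apply the same 2-D rotation to each (x, y) pair; they differ only in where the pair lives: the NeoX path pairs element j with element j + rot_dim/2, while the GPT-J path pairs adjacent elements 2j and 2j + 1. A scalar sketch of that rotation, for illustration only (not part of the file):

// Rotate one (x, y) pair by the cached angle; this is the core of both
// rotary_embedding_impl (NeoX layout) and rotary_embedding_gptj_impl (GPT-J
// layout) above.
inline void rotate_pair(float& x, float& y, float cos_v, float sin_v) {
  const float x_new = x * cos_v - y * sin_v;
  const float y_new = y * cos_v + x * sin_v;
  x = x_new;
  y = y_new;
}

// NeoX layout: the two halves of the rotated part are paired element-wise.
void rotate_head_neox(float* head, const float* cos_cache,
                      const float* sin_cache, int embed_dim) {
  for (int j = 0; j < embed_dim; ++j)
    rotate_pair(head[j], head[j + embed_dim], cos_cache[j], sin_cache[j]);
}

// GPT-J layout: adjacent elements (2j, 2j + 1) form a pair.
void rotate_head_gptj(float* head, const float* cos_cache,
                      const float* sin_cache, int embed_dim) {
  for (int j = 0; j < embed_dim; ++j)
    rotate_pair(head[2 * j], head[2 * j + 1], cos_cache[j], sin_cache[j]);
}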
csrc/cpu/torch_bindings.cpp (new file, 106 lines)
@@ -0,0 +1,106 @@
#include "cache.h"
#include "ops.h"
#include "registration.h"

#include <torch/library.h>

TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
  // vLLM custom ops

  // Attention ops
  // Compute the attention between an input query and the cached keys/values
  // using PagedAttention.
  ops.def(
      "paged_attention_v1("
      "  Tensor! out, Tensor query, Tensor key_cache,"
      "  Tensor value_cache, int num_kv_heads, float scale,"
      "  Tensor block_tables, Tensor seq_lens, int block_size,"
      "  int max_seq_len, Tensor? alibi_slopes,"
      "  str kv_cache_dtype, float kv_scale, int tp_rank,"
      "  int blocksparse_local_blocks,"
      "  int blocksparse_vert_stride, int blocksparse_block_size,"
      "  int blocksparse_head_sliding_step) -> ()");
  ops.impl("paged_attention_v1", torch::kCPU, &paged_attention_v1);

  // PagedAttention V2.
  ops.def(
      "paged_attention_v2("
      "  Tensor! out, Tensor exp_sums, Tensor max_logits,"
      "  Tensor tmp_out, Tensor query, Tensor key_cache,"
      "  Tensor value_cache, int num_kv_heads, float scale,"
      "  Tensor block_tables, Tensor seq_lens, int block_size,"
      "  int max_seq_len, Tensor? alibi_slopes,"
      "  str kv_cache_dtype, float kv_scale, int tp_rank,"
      "  int blocksparse_local_blocks,"
      "  int blocksparse_vert_stride, int blocksparse_block_size,"
      "  int blocksparse_head_sliding_step) -> ()");
  ops.impl("paged_attention_v2", torch::kCPU, &paged_attention_v2);

  // Activation ops

  // Activation function used in SwiGLU.
  ops.def("silu_and_mul(Tensor! out, Tensor input) -> ()");
  ops.impl("silu_and_mul", torch::kCPU, &silu_and_mul);

  // Activation function used in GeGLU with `none` approximation.
  ops.def("gelu_and_mul(Tensor! out, Tensor input) -> ()");
  ops.impl("gelu_and_mul", torch::kCPU, &gelu_and_mul);

  // Activation function used in GeGLU with `tanh` approximation.
  ops.def("gelu_tanh_and_mul(Tensor! out, Tensor input) -> ()");
  ops.impl("gelu_tanh_and_mul", torch::kCPU, &gelu_tanh_and_mul);

  // GELU implementation used in GPT-2.
  ops.def("gelu_new(Tensor! out, Tensor input) -> ()");
  ops.impl("gelu_new", torch::kCPU, &gelu_new);

  // Approximate GELU implementation.
  ops.def("gelu_fast(Tensor! out, Tensor input) -> ()");
  ops.impl("gelu_fast", torch::kCPU, &gelu_fast);

  // Layernorm
  // Apply Root Mean Square (RMS) Normalization to the input tensor.
  ops.def(
      "rms_norm(Tensor! out, Tensor input, Tensor weight, float epsilon) -> "
      "()");
  ops.impl("rms_norm", torch::kCPU, &rms_norm);

  // In-place fused Add and RMS Normalization.
  ops.def(
      "fused_add_rms_norm(Tensor! input, Tensor! residual, Tensor weight, "
      "float epsilon) -> ()");
  ops.impl("fused_add_rms_norm", torch::kCPU, &fused_add_rms_norm);

  // Rotary embedding
  // Apply GPT-NeoX or GPT-J style rotary embedding to query and key.
  ops.def(
      "rotary_embedding(Tensor positions, Tensor! query,"
      "                 Tensor! key, int head_size,"
      "                 Tensor cos_sin_cache, bool is_neox) -> ()");
  ops.impl("rotary_embedding", torch::kCPU, &rotary_embedding);
}

TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
  // Cache ops
  // Swap in (out) the cache blocks from src to dst.
  cache_ops.def(
      "swap_blocks(Tensor src, Tensor! dst, Tensor block_mapping) -> ()");
  cache_ops.impl("swap_blocks", torch::kCPU, &swap_blocks);

  // Copy the cache blocks from src to dst.
  cache_ops.def(
      "copy_blocks(Tensor[]! key_caches, Tensor[]! value_caches, Tensor "
      "block_mapping) -> ()");
  cache_ops.impl("copy_blocks", torch::kCPU, &copy_blocks);

  // Reshape the key and value tensors and cache them.
  cache_ops.def(
      "reshape_and_cache(Tensor key, Tensor value,"
      "                  Tensor! key_cache, Tensor! value_cache,"
      "                  Tensor slot_mapping,"
      "                  str kv_cache_dtype,"
      "                  float kv_scale) -> ()");
  cache_ops.impl("reshape_and_cache", torch::kCPU, &reshape_and_cache);
}

REGISTER_EXTENSION(TORCH_EXTENSION_NAME)
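The registrations above only bind existing C++ functions to the CPU dispatch key. As a rough usage sketch, the same functions can be called directly from host code, assuming the declarations from ops.h and a libtorch build; the shapes and epsilon below are arbitrary and chosen only so the hidden size satisfies the kernels' vector-width check:

#include <torch/torch.h>

#include "ops.h"  // declares rms_norm, fused_add_rms_norm, ...

int main() {
  const int64_t num_tokens = 4, hidden_size = 64;
  auto input = torch::randn({num_tokens, hidden_size});
  auto weight = torch::ones({hidden_size});
  auto out = torch::empty_like(input);

  // Matches the "rms_norm(Tensor! out, ...)" schema registered above.
  rms_norm(out, input, weight, /*epsilon=*/1e-5);

  auto residual = torch::randn({num_tokens, hidden_size});
  // In-place: writes the normalized result to `input` and the sum to `residual`.
  fused_add_rms_norm(input, residual, weight, /*epsilon=*/1e-5);
  return 0;
}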
csrc/cuda_compat.h (new file, 49 lines)
@@ -0,0 +1,49 @@
#pragma once

#ifdef USE_ROCM
#include <hip/hip_runtime.h>
#endif

#ifndef USE_ROCM
#define WARP_SIZE 32
#else
#define WARP_SIZE warpSize
#endif

#ifndef USE_ROCM
#define VLLM_LDG(arg) __ldg(arg)
#else
#define VLLM_LDG(arg) *(arg)
#endif

#ifndef USE_ROCM
#define VLLM_SHFL_XOR_SYNC(var, lane_mask) \
  __shfl_xor_sync(uint32_t(-1), var, lane_mask)
#define VLLM_SHFL_XOR_SYNC_WIDTH(var, lane_mask, width) \
  __shfl_xor_sync(uint32_t(-1), var, lane_mask, width)
#else
#define VLLM_SHFL_XOR_SYNC(var, lane_mask) __shfl_xor(var, lane_mask)
#define VLLM_SHFL_XOR_SYNC_WIDTH(var, lane_mask, width) \
  __shfl_xor(var, lane_mask, width)
#endif

#ifndef USE_ROCM
#define VLLM_SHFL_SYNC(var, src_lane) __shfl_sync(uint32_t(-1), var, src_lane)
#else
#define VLLM_SHFL_SYNC(var, src_lane) __shfl(var, src_lane)
#endif

#ifndef USE_ROCM
#define VLLM_SHFL_DOWN_SYNC(var, lane_delta) \
  __shfl_down_sync(uint32_t(-1), var, lane_delta)
#else
#define VLLM_SHFL_DOWN_SYNC(var, lane_delta) __shfl_down(var, lane_delta)
#endif

#ifndef USE_ROCM
#define VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(FUNC, VAL) \
  cudaFuncSetAttribute(FUNC, cudaFuncAttributeMaxDynamicSharedMemorySize, VAL)
#else
#define VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(FUNC, VAL) \
  hipFuncSetAttribute(FUNC, hipFuncAttributeMaxDynamicSharedMemorySize, VAL)
#endif
@@ -1,13 +0,0 @@
-#include <torch/extension.h>
-
-int get_device_attribute(
-    int attribute,
-    int device_id);
-
-PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
-  m.def(
-    "get_device_attribute",
-    &get_device_attribute,
-    "Gets the specified device attribute.");
-}
-
csrc/cuda_utils.h (new file, 5 lines)
@@ -0,0 +1,5 @@
#pragma once

int64_t get_device_attribute(int64_t attribute, int64_t device_id);

int64_t get_max_shared_memory_per_block_device_attribute(int64_t device_id);
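A short host-side sketch of calling these helpers (illustrative only; it assumes a CUDA-capable build where device 0 exists and the implementation below is linked in):

#include <cstdio>

#include "cuda_utils.h"

int main() {
  // Query the maximum shared memory per block for device 0 through the
  // helper declared above; -1 would fall back to the current device.
  int64_t smem = get_max_shared_memory_per_block_device_attribute(0);
  std::printf("max shared memory per block: %lld bytes\n",
              static_cast<long long>(smem));
  return 0;
}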
@@ -1,14 +1,29 @@
-int get_device_attribute(
-    int attribute,
-    int device_id)
-{
-    int device, value;
-    if (device_id < 0) {
-        cudaGetDevice(&device);
-    }
-    else {
-        device = device_id;
-    }
-    cudaDeviceGetAttribute(&value, static_cast<cudaDeviceAttr>(attribute), device);
-    return value;
-}
+#ifdef USE_ROCM
+#include <hip/hip_runtime.h>
+#include <hip/hip_runtime_api.h>
+#endif
+int64_t get_device_attribute(int64_t attribute, int64_t device_id) {
+  int device, value;
+  if (device_id < 0) {
+    cudaGetDevice(&device);
+  } else {
+    device = device_id;
+  }
+  cudaDeviceGetAttribute(&value, static_cast<cudaDeviceAttr>(attribute),
+                         device);
+  return value;
+}
+
+int64_t get_max_shared_memory_per_block_device_attribute(int64_t device_id) {
+  int64_t attribute;
+  // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html
+  // cudaDevAttrMaxSharedMemoryPerBlockOptin = 97 if not is_hip() else 74
+
+#ifdef USE_ROCM
+  attribute = hipDeviceAttributeMaxSharedMemoryPerBlock;
+#else
+  attribute = cudaDevAttrMaxSharedMemoryPerBlockOptin;
+#endif
+
+  return get_device_attribute(attribute, device_id);
+}
csrc/custom_all_reduce.cu (new file, 153 lines)
@@ -0,0 +1,153 @@
#include <ATen/cuda/Exceptions.h>
#include <c10/cuda/CUDAGuard.h>
#include <c10/cuda/CUDAStream.h>
#include <torch/all.h>

#include "custom_all_reduce.cuh"

// fake pointer type, must match fptr_t type in ops.h
using fptr_t = int64_t;
static_assert(sizeof(void*) == sizeof(fptr_t));

fptr_t init_custom_ar(torch::Tensor& meta, torch::Tensor& rank_data,
                      const std::vector<std::string>& handles,
                      const std::vector<int64_t>& offsets, int64_t rank,
                      bool full_nvlink) {
  int world_size = offsets.size();
  if (world_size > 8)
    throw std::invalid_argument("world size > 8 is not supported");
  if (world_size % 2 != 0)
    throw std::invalid_argument("Odd num gpus is not supported for now");
  if (world_size != handles.size())
    throw std::invalid_argument(
        "handles length should equal to offsets length");
  if (rank < 0 || rank >= world_size)
    throw std::invalid_argument("invalid rank passed in");

  cudaIpcMemHandle_t ipc_handles[8];
  for (int i = 0; i < world_size; i++) {
    std::memcpy(&ipc_handles[i], handles[i].data(), sizeof(cudaIpcMemHandle_t));
  }
  return (fptr_t) new vllm::CustomAllreduce(
      reinterpret_cast<vllm::Signal*>(meta.data_ptr()), rank_data.data_ptr(),
      rank_data.numel(), ipc_handles, offsets, rank, full_nvlink);
}

/**
 * Make sure tensor t's data lies completely within ((char)t.data_ptr()) +
 * t.numel() * t.element_size(). This is slightly weaker than t.is_contiguous()
 * because it allows transpose of contiguous slice (i.e. slicing the first
 * dimension). Currently, we require this because stride information is not
 * passed into the kernels and we treat input tensors as flat.
 *
 * Examples
 * A = torch.zeros(3, 3, 3)
 * 1. A: OK
 * 2. A[1:]: OK
 * 3. A.permute(2, 0, 1): OK
 * 4. A[1:].permute(2, 0, 1): OK
 * 5. A[None].expand(2, -1, -1, -1): Not OK
 * 6. A[:, 1:, 1:]: Not OK
 */
bool _is_weak_contiguous(torch::Tensor& t) {
  return t.is_contiguous() ||
         (t.storage().nbytes() - t.storage_offset() * t.element_size() ==
          t.numel() * t.element_size());
}

bool should_custom_ar(torch::Tensor& inp, int64_t max_size, int64_t world_size,
                      bool full_nvlink) {
  auto inp_size = inp.numel() * inp.element_size();
  // custom allreduce requires input byte size to be multiples of 16
  if (inp_size % 16 != 0) return false;
  if (!_is_weak_contiguous(inp)) return false;
  if (world_size == 2 || full_nvlink) return inp_size <= max_size;
  // for 4 or more non NVLink-capable GPUs, custom allreduce provides little
  // performance improvement over NCCL.
  return false;
}

void _all_reduce(fptr_t _fa, torch::Tensor& inp, torch::Tensor& out,
                 cudaStream_t stream) {
  auto fa = reinterpret_cast<vllm::CustomAllreduce*>(_fa);
  TORCH_CHECK(_is_weak_contiguous(out));
  switch (out.scalar_type()) {
    case at::ScalarType::Float: {
      fa->allreduce<float>(stream, reinterpret_cast<float*>(inp.data_ptr()),
                           reinterpret_cast<float*>(out.data_ptr()),
                           out.numel());
      break;
    }
    case at::ScalarType::Half: {
      fa->allreduce<half>(stream, reinterpret_cast<half*>(inp.data_ptr()),
                          reinterpret_cast<half*>(out.data_ptr()), out.numel());
      break;
    }
#if (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__))
    case at::ScalarType::BFloat16: {
      fa->allreduce<nv_bfloat16>(
          stream, reinterpret_cast<nv_bfloat16*>(inp.data_ptr()),
          reinterpret_cast<nv_bfloat16*>(out.data_ptr()), out.numel());
      break;
    }
#endif
    default:
      throw std::runtime_error(
          "custom allreduce only supports float32, float16 and bfloat16");
  }
}

void all_reduce_reg(fptr_t _fa, torch::Tensor& inp, torch::Tensor& out) {
  const at::cuda::OptionalCUDAGuard device_guard(device_of(inp));
  auto stream = c10::cuda::getCurrentCUDAStream().stream();
  TORCH_CHECK_EQ(inp.scalar_type(), out.scalar_type());
  TORCH_CHECK_EQ(inp.numel(), out.numel());
  _all_reduce(_fa, inp, out, stream);
}

void all_reduce_unreg(fptr_t _fa, torch::Tensor& inp, torch::Tensor& reg_buffer,
                      torch::Tensor& out) {
  const at::cuda::OptionalCUDAGuard device_guard(device_of(inp));
  auto stream = c10::cuda::getCurrentCUDAStream().stream();

  auto input_size = inp.numel() * inp.element_size();
  TORCH_CHECK_EQ(inp.scalar_type(), out.scalar_type());
  TORCH_CHECK_EQ(inp.numel(), out.numel());
  TORCH_CHECK(input_size <= reg_buffer.numel() * reg_buffer.element_size(),
              "registered buffer is too small to contain the input");
  AT_CUDA_CHECK(cudaMemcpyAsync(reg_buffer.data_ptr(), inp.data_ptr(),
                                input_size, cudaMemcpyDeviceToDevice, stream));
  _all_reduce(_fa, reg_buffer, out, stream);
}

void dispose(fptr_t _fa) {
  auto fa = reinterpret_cast<vllm::CustomAllreduce*>(_fa);
  delete fa;
}

int64_t meta_size() { return sizeof(vllm::Signal); }

void register_buffer(fptr_t _fa, torch::Tensor& t,
                     const std::vector<std::string>& handles,
                     const std::vector<int64_t>& offsets) {
  auto fa = reinterpret_cast<vllm::CustomAllreduce*>(_fa);
  fa->register_buffer(handles, offsets, t.data_ptr());
}

std::tuple<torch::Tensor, std::vector<int64_t>> get_graph_buffer_ipc_meta(
    fptr_t _fa) {
  auto fa = reinterpret_cast<vllm::CustomAllreduce*>(_fa);
  auto [handle_bytes, offsets] = fa->get_graph_buffer_ipc_meta();
  auto options =
      torch::TensorOptions().dtype(torch::kUInt8).device(torch::kCPU);
  auto handles =
      torch::empty({static_cast<int64_t>(handle_bytes.size())}, options);
  std::memcpy(handles.data_ptr(), handle_bytes.data(), handle_bytes.size());
  return {handles, std::move(offsets)};
}

void register_graph_buffers(fptr_t _fa, const std::vector<std::string>& handles,
                            const std::vector<std::vector<int64_t>>& offsets) {
  auto fa = reinterpret_cast<vllm::CustomAllreduce*>(_fa);
  fa->register_graph_buffers(handles, offsets);
}
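The _is_weak_contiguous check above is the gatekeeper for should_custom_ar. The following standalone sketch evaluates the same condition on a few of the tensors listed in its comment; it is illustrative only and uses public ATen calls rather than the file's internal function:

#include <cstdio>

#include <torch/torch.h>

// Same condition as _is_weak_contiguous above: the tensor's elements must
// occupy the tail of its storage as one dense block.
static bool weak_contiguous(const torch::Tensor& t) {
  return t.is_contiguous() ||
         (static_cast<int64_t>(t.storage().nbytes()) -
              t.storage_offset() * t.element_size() ==
          t.numel() * t.element_size());
}

int main() {
  auto A = torch::zeros({3, 3, 3});
  std::printf("A: %d\n", weak_contiguous(A));                       // OK
  std::printf("A[1:]: %d\n", weak_contiguous(A.slice(0, 1)));       // OK
  std::printf("A.permute(2,0,1): %d\n",
              weak_contiguous(A.permute({2, 0, 1})));               // OK
  std::printf("A[:,1:,1:]: %d\n",
              weak_contiguous(A.slice(1, 1).slice(2, 1)));          // Not OK
  return 0;
}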
csrc/custom_all_reduce.cuh (new file, 482 lines)
@@ -0,0 +1,482 @@
#pragma once

#include <cuda.h>
#include <cuda_bf16.h>
#include <cuda_fp16.h>
#include <cuda_runtime.h>

#include <iostream>
#include <limits>
#include <map>
#include <unordered_map>
#include <vector>

#define CUDACHECK(cmd)                                              \
  do {                                                              \
    cudaError_t e = cmd;                                            \
    if (e != cudaSuccess) {                                         \
      printf("Failed: Cuda error %s:%d '%s'\n", __FILE__, __LINE__, \
             cudaGetErrorString(e));                                \
      exit(EXIT_FAILURE);                                           \
    }                                                               \
  } while (0)

namespace vllm {

constexpr int kMaxBlocks = 64;
// note: we don't want to use atomics for signals because peer atomics are not
// supported on PCIe links
struct Signal {
  alignas(128) uint32_t start[kMaxBlocks][8];
  alignas(128) uint32_t end[kMaxBlocks][8];
};

struct __align__(16) RankData { const void* __restrict__ ptrs[8]; };

struct __align__(16) RankSignals { volatile Signal* signals[8]; };

// like std::array, but aligned
template <typename T, int sz>
struct __align__(alignof(T) * sz) array_t {
  T data[sz];
  using type = T;
  static constexpr int size = sz;
};

// use packed type to maximize memory efficiency
// goal: generate ld.128 and st.128 instructions
template <typename T>
struct packed_t {
  // the (P)acked type for load/store
  using P = array_t<T, 16 / sizeof(T)>;
  // the (A)ccumulator type for reduction
  using A = array_t<float, 16 / sizeof(T)>;
};

#define DINLINE __device__ __forceinline__

// scalar cast functions
DINLINE float upcast_s(half val) { return __half2float(val); }

template <typename T>
DINLINE T downcast_s(float val);
template <>
DINLINE half downcast_s(float val) {
  return __float2half(val);
}

// scalar add functions
// for some reason when compiling with Pytorch, the + operator for half and
// bfloat is disabled so we call the intrinsics directly
DINLINE half& assign_add(half& a, half b) {
  a = __hadd(a, b);
  return a;
}
DINLINE float& assign_add(float& a, float b) { return a += b; }

#if (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__))
DINLINE float upcast_s(nv_bfloat16 val) { return __bfloat162float(val); }
template <>
DINLINE nv_bfloat16 downcast_s(float val) {
  return __float2bfloat16(val);
}
DINLINE nv_bfloat16& assign_add(nv_bfloat16& a, nv_bfloat16 b) {
  a = __hadd(a, b);
  return a;
}
#endif

template <typename T, int N>
DINLINE array_t<T, N>& packed_assign_add(array_t<T, N>& a, array_t<T, N> b) {
#pragma unroll
  for (int i = 0; i < N; i++) {
    assign_add(a.data[i], b.data[i]);
  }
  return a;
}

template <typename T, int N>
DINLINE array_t<float, N> upcast(array_t<T, N> val) {
  if constexpr (std::is_same<T, float>::value) {
    return val;
  } else {
    array_t<float, N> out;
#pragma unroll
    for (int i = 0; i < N; i++) {
      out.data[i] = upcast_s(val.data[i]);
    }
    return out;
  }
}

template <typename O>
DINLINE O downcast(array_t<float, O::size> val) {
  if constexpr (std::is_same<typename O::type, float>::value) {
    return val;
  } else {
    O out;
#pragma unroll
    for (int i = 0; i < O::size; i++) {
      out.data[i] = downcast_s<typename O::type>(val.data[i]);
    }
    return out;
  }
}

// This function is meant to be used as the first synchronization in the all
// reduce kernel. Thus, it doesn't need to make any visibility guarantees for
// prior memory accesses. Note: volatile writes will not be reordered against
// other volatile writes.
template <int ngpus>
DINLINE void start_sync(const RankSignals& sg, volatile Signal* self_sg,
                        int rank) {
  if (threadIdx.x < ngpus) {
    // reset flag for next time
    self_sg->end[blockIdx.x][threadIdx.x] = 0;
    // simultaneously write to the corresponding flag of all ranks.
    // Latency = 1 p2p write
    sg.signals[threadIdx.x]->start[blockIdx.x][rank] = 1;
    // wait until we got true from all ranks
    while (!self_sg->start[blockIdx.x][threadIdx.x]);
  }
  __syncthreads();
}

// This function is meant to be used as the second or the final synchronization
// barrier in the all reduce kernel. If it's the final synchronization barrier,
// we don't need to make any visibility guarantees for prior memory accesses.
template <int ngpus, bool final_sync = false>
DINLINE void end_sync(const RankSignals& sg, volatile Signal* self_sg,
                      int rank) {
  __syncthreads();
  // eliminate the case that prior writes are not visible after signals become
  // visible. Note that I did not manage to make this happen through a lot of
  // testing. Might be the case that hardware provides stronger guarantee than
  // the memory model.
  if constexpr (!final_sync) __threadfence_system();
  if (threadIdx.x < ngpus) {
    // reset flag for next time
    self_sg->start[blockIdx.x][threadIdx.x] = 0;
    // simultaneously write to the corresponding flag of all ranks.
    // Latency = 1 p2p write
    sg.signals[threadIdx.x]->end[blockIdx.x][rank] = 1;
    // wait until we got true from all ranks
    while (!self_sg->end[blockIdx.x][threadIdx.x]);
  }
  if constexpr (!final_sync) __syncthreads();
}

template <typename P, int ngpus, typename A>
DINLINE P packed_reduce(const P* ptrs[], int idx) {
  A tmp = upcast(ptrs[0][idx]);
#pragma unroll
  for (int i = 1; i < ngpus; i++) {
    packed_assign_add(tmp, upcast(ptrs[i][idx]));
  }
  return downcast<P>(tmp);
}

template <typename T, int ngpus>
__global__ void __launch_bounds__(512, 1)
    cross_device_reduce_1stage(RankData* _dp, RankSignals sg,
                               volatile Signal* self_sg, T* __restrict__ result,
                               int rank, int size) {
  using P = typename packed_t<T>::P;
  using A = typename packed_t<T>::A;
  // note: we don't reorder the address so the accumulation order is the same
  // for all ranks, ensuring bitwise identical results
  auto dp = *_dp;
  start_sync<ngpus>(sg, self_sg, rank);
  // do the actual reduction
  for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size;
       idx += gridDim.x * blockDim.x) {
    ((P*)result)[idx] = packed_reduce<P, ngpus, A>((const P**)&dp.ptrs[0], idx);
  }
  end_sync<ngpus, true>(sg, self_sg, rank);
}

template <typename P>
DINLINE P* get_tmp_buf(volatile Signal* sg) {
  return (P*)(((Signal*)sg) + 1);
}

template <typename T, int ngpus>
__global__ void __launch_bounds__(512, 1)
    cross_device_reduce_2stage(RankData* _dp, RankSignals sg,
                               volatile Signal* self_sg, T* __restrict__ result,
                               int rank, int size) {
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  int stride = gridDim.x * blockDim.x;
  using P = typename packed_t<T>::P;
  using A = typename packed_t<T>::A;
  int part = size / ngpus;
  int start = rank * part;
  int end = rank == ngpus - 1 ? size : start + part;
  int largest_part = part + size % ngpus;
  const P* ptrs[ngpus];
  P* tmps[ngpus];
#pragma unroll
  for (int i = 0; i < ngpus; i++) {
    int target = (rank + i) % ngpus;
    ptrs[i] = (const P*)_dp->ptrs[target];
    tmps[i] = get_tmp_buf<P>(sg.signals[target]);
  }
  auto tmp_out = tmps[0];
  start_sync<ngpus>(sg, self_sg, rank);
  // stage 1: reduce scatter
  for (int idx = start + tid; idx < end; idx += stride) {
    tmp_out[idx - start] = packed_reduce<P, ngpus, A>(ptrs, idx);
  }
  end_sync<ngpus>(sg, self_sg, rank);

  // stage 2: allgather. Note: it's important to match the tid between
  // the two stages, because visibility across devices is only guaranteed
  // between threads that have the same tid. If thread i computes the sum of
  // start + i in the first stage, then thread i also gathers start + i from all
  // ranks.
  for (int idx = tid; idx < largest_part; idx += stride) {
#pragma unroll
    for (int i = 0; i < ngpus; i++) {
      int gather_from_rank = ((rank + i) % ngpus);
      if (gather_from_rank == ngpus - 1 || idx < part) {
        int dst_idx = gather_from_rank * part + idx;
        ((P*)result)[dst_idx] = tmps[i][idx];
      }
    }
  }
}

using IPC_KEY = std::array<uint8_t, sizeof(cudaIpcMemHandle_t)>;
static_assert(sizeof(IPC_KEY) == sizeof(cudaIpcMemHandle_t));
static_assert(alignof(IPC_KEY) == alignof(cudaIpcMemHandle_t));

class CustomAllreduce {
 public:
  int rank_;
  int world_size_;
  bool full_nvlink_;

  // below are device pointers
  RankSignals sg_;
  std::unordered_map<void*, RankData*> buffers_;
  Signal* self_sg_;

  // stores the registered device pointers from all ranks
  RankData *d_rank_data_base_, *d_rank_data_end_;
  std::vector<void*> graph_unreg_buffers_;
  // a map from IPC handles to opened IPC pointers
  std::map<IPC_KEY, char*> ipc_handles_;

  /**
   * meta is a pointer to device metadata and temporary buffer for allreduce.
   *
   * There's a total of sizeof(Signal) of prefix before the actual data,
   * so meta + 1 points to actual temporary buffer.
   *
   * note: this class does not own any device memory. Any required buffers
   * are passed in from the constructor
   */
  CustomAllreduce(Signal* meta, void* rank_data, size_t rank_data_sz,
                  const cudaIpcMemHandle_t* handles,
                  const std::vector<int64_t>& offsets, int rank,
                  bool full_nvlink = true)
      : rank_(rank),
        world_size_(offsets.size()),
        full_nvlink_(full_nvlink),
        self_sg_(meta),
        d_rank_data_base_(reinterpret_cast<RankData*>(rank_data)),
        d_rank_data_end_(d_rank_data_base_ + rank_data_sz / sizeof(RankData)) {
    for (int i = 0; i < world_size_; i++) {
      Signal* rank_sg;
      if (i != rank_) {
        char* handle = open_ipc_handle(&handles[i]);
        handle += offsets[i];
        rank_sg = (Signal*)handle;
      } else {
        rank_sg = self_sg_;
      }
      sg_.signals[i] = rank_sg;
    }
  }

  char* open_ipc_handle(const void* ipc_handle) {
    auto [it, new_handle] =
        ipc_handles_.insert({*((IPC_KEY*)ipc_handle), nullptr});
    if (new_handle) {
      char* ipc_ptr;
      CUDACHECK(cudaIpcOpenMemHandle((void**)&ipc_ptr,
                                     *((const cudaIpcMemHandle_t*)ipc_handle),
                                     cudaIpcMemLazyEnablePeerAccess));
      it->second = ipc_ptr;
    }
    return it->second;
  }

  std::pair<std::vector<uint8_t>, std::vector<int64_t>>
  get_graph_buffer_ipc_meta() {
    auto num_buffers = graph_unreg_buffers_.size();
    auto handle_sz = sizeof(cudaIpcMemHandle_t);
    std::vector<uint8_t> handles(handle_sz * num_buffers, 0);
    std::vector<int64_t> offsets(num_buffers);
    for (int i = 0; i < num_buffers; i++) {
      auto ptr = graph_unreg_buffers_[i];
      void* base_ptr;
      // note: must share the base address of each allocation, or we get wrong
      // address
      if (cuPointerGetAttribute(&base_ptr,
                                CU_POINTER_ATTRIBUTE_RANGE_START_ADDR,
                                (CUdeviceptr)ptr) != CUDA_SUCCESS)
        throw std::runtime_error("failed to get pointer attr");
      CUDACHECK(cudaIpcGetMemHandle(
          (cudaIpcMemHandle_t*)&handles[i * handle_sz], base_ptr));
      offsets[i] = ((char*)ptr) - ((char*)base_ptr);
    }
    return std::make_pair(handles, offsets);
  }

  void check_rank_data_capacity(size_t num = 1) {
    if (d_rank_data_base_ + num > d_rank_data_end_)
      throw std::runtime_error(
          "Rank data buffer is overflowed by " +
          std::to_string(d_rank_data_base_ + num - d_rank_data_end_));
  }

  void register_buffer(const std::vector<std::string>& handles,
                       const std::vector<int64_t>& offsets, void* self) {
    check_rank_data_capacity();
    RankData data;
    for (int i = 0; i < world_size_; i++) {
      if (i != rank_) {
        char* handle = open_ipc_handle(handles[i].data());
        handle += offsets[i];
        data.ptrs[i] = handle;
      } else {
        data.ptrs[i] = self;
      }
    }
    auto d_data = d_rank_data_base_++;
    CUDACHECK(
        cudaMemcpy(d_data, &data, sizeof(RankData), cudaMemcpyHostToDevice));
    buffers_[self] = d_data;
  }

  // note: when registering graph buffers, we intentionally choose to not
  // deduplicate the addresses. That means if the allocator reuses some
  // addresses, they will be registered again. This is to account for the remote
  // possibility of different allocation patterns between ranks. For example,
  // rank 1 may get the same input address for the second allreduce, but rank 2
  // got a different address. IPC handles have internal reference counting
  // mechanism so overhead should be small.
  void register_graph_buffers(
      const std::vector<std::string>& handles,
      const std::vector<std::vector<int64_t>>& offsets) {
    auto num_buffers = graph_unreg_buffers_.size();
    check_rank_data_capacity(num_buffers);
    std::vector<RankData> rank_data(num_buffers);
    for (int i = 0; i < num_buffers; i++) {
      auto self_ptr = graph_unreg_buffers_[i];
      auto& rd = rank_data[i];
      for (int j = 0; j < world_size_; j++) {
        if (j != rank_) {
          char* handle =
              open_ipc_handle(&handles[j][i * sizeof(cudaIpcMemHandle_t)]);
          handle += offsets[j][i];
          rd.ptrs[j] = handle;
        } else {
          rd.ptrs[j] = self_ptr;
        }
      }
    }
    CUDACHECK(cudaMemcpy(d_rank_data_base_, rank_data.data(),
                         sizeof(RankData) * num_buffers,
                         cudaMemcpyHostToDevice));
    d_rank_data_base_ += num_buffers;
    graph_unreg_buffers_.clear();
  }

  /**
   * This is the result after careful grid search. Using 36 blocks gives the
   * best or close to the best runtime on the devices I tried: A100, A10, A30,
   * T4, V100. You'll notice that NCCL kernels also only take a small amount of
   * SMs. Not quite sure the underlying reason, but my guess is that too many
   * SMs will cause contention on NVLink bus.
   */
  template <typename T>
  void allreduce(cudaStream_t stream, T* input, T* output, int size,
                 int threads = 512, int block_limit = 36) {
    auto d = packed_t<T>::P::size;
    if (size % d != 0)
      throw std::runtime_error(
          "custom allreduce currently requires input length to be multiple "
          "of " +
          std::to_string(d));
    if (block_limit > kMaxBlocks)
      throw std::runtime_error("max supported block limit is " +
                               std::to_string(kMaxBlocks) + ". Got " +
                               std::to_string(block_limit));

    RankData* ptrs;
    cudaStreamCaptureStatus status;
    CUDACHECK(cudaStreamIsCapturing(stream, &status));
    if (status == cudaStreamCaptureStatusActive) {
      ptrs = d_rank_data_base_ + graph_unreg_buffers_.size();
      graph_unreg_buffers_.push_back(input);
    } else {
      auto it = buffers_.find(input);
      if (it == buffers_.end())
        throw std::runtime_error(
            "buffer address " +
            std::to_string(reinterpret_cast<uint64_t>(input)) +
            " is not registered!");
      ptrs = it->second;
    }

    size /= d;
    auto bytes = size * sizeof(typename packed_t<T>::P);
    int blocks = std::min(block_limit, (size + threads - 1) / threads);
#define KL(ngpus, name)                                                       \
  name<T, ngpus><<<blocks, threads, 0, stream>>>(ptrs, sg_, self_sg_, output, \
                                                 rank_, size);
#define REDUCE_CASE(ngpus)                            \
  case ngpus: {                                       \
    if (world_size_ == 2) {                           \
      KL(ngpus, cross_device_reduce_1stage);          \
    } else if (full_nvlink_) {                        \
      if ((world_size_ <= 4 && bytes < 512 * 1024) || \
          (world_size_ <= 8 && bytes < 256 * 1024)) { \
        KL(ngpus, cross_device_reduce_1stage);        \
      } else {                                        \
        KL(ngpus, cross_device_reduce_2stage);        \
      }                                               \
    }                                                 \
    break;                                            \
  }

    switch (world_size_) {
      REDUCE_CASE(2)
      REDUCE_CASE(4)
      REDUCE_CASE(6)
      REDUCE_CASE(8)
      default:
        throw std::runtime_error(
            "custom allreduce only supports num gpus in (2,4,6,8). Actual num "
            "gpus = " +
            std::to_string(world_size_));
    }
#undef REDUCE_CASE
#undef KL
  }

  ~CustomAllreduce() {
    for (auto [_, ptr] : ipc_handles_) {
      CUDACHECK(cudaIpcCloseMemHandle(ptr));
    }
  }
};
/**
 * To inspect PTX/SASS, copy-paste this header file to compiler explorer and
 * add a template instantiation:
 * template void vllm::CustomAllreduce::allreduce<half>(cudaStream_t, half *,
 * half *, int, int, int);
 */
}  // namespace vllm
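The REDUCE_CASE macro above hides the kernel-selection heuristic inside the switch. Restated as a plain host-side function for readability (a sketch only, not part of the header):

enum class ArKernel { OneStage, TwoStage, None };

// Mirrors the dispatch in CustomAllreduce::allreduce: two ranks always use the
// single-stage kernel; with full NVLink, small payloads stay single-stage and
// larger ones switch to the two-stage (reduce-scatter + all-gather) kernel;
// otherwise no custom kernel is launched for that case.
ArKernel select_kernel(int world_size, bool full_nvlink, size_t bytes) {
  if (world_size == 2) return ArKernel::OneStage;
  if (full_nvlink) {
    if ((world_size <= 4 && bytes < 512 * 1024) ||
        (world_size <= 8 && bytes < 256 * 1024))
      return ArKernel::OneStage;
    return ArKernel::TwoStage;
  }
  return ArKernel::None;
}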
csrc/custom_all_reduce_test.cu (new file, 316 lines; listing truncated below)
@@ -0,0 +1,316 @@
/**
 * This is a standalone test for custom allreduce.
 * To compile, make sure you have MPI and NCCL installed in your system.
 * export MPI_HOME=XXX
 * nvcc -O2 -arch=native -std=c++17 custom_all_reduce_test.cu -o
 * custom_all_reduce_test -lnccl -I${MPI_HOME}/include -lmpi
 *
 * Warning: this C++ test is not designed to be very readable and was used
 * during the rapid prototyping process.
 *
 * To run:
 * mpirun -np 8 ./custom_all_reduce_test
 */
#include <cuda.h>
#include <curand_kernel.h>
#include <stdio.h>
#include <stdlib.h>

#include <limits>
#include <vector>

#include "cuda_profiler_api.h"
#include "custom_all_reduce.cuh"
#include "mpi.h"
#include "nccl.h"

#define MPICHECK(cmd)                                                  \
  do {                                                                 \
    int e = cmd;                                                       \
    if (e != MPI_SUCCESS) {                                            \
      printf("Failed: MPI error %s:%d '%d'\n", __FILE__, __LINE__, e); \
      exit(EXIT_FAILURE);                                              \
    }                                                                  \
  } while (0)

#define NCCLCHECK(cmd)                                              \
  do {                                                              \
    ncclResult_t r = cmd;                                           \
    if (r != ncclSuccess) {                                         \
      printf("Failed, NCCL error %s:%d '%s'\n", __FILE__, __LINE__, \
             ncclGetErrorString(r));                                \
      exit(EXIT_FAILURE);                                           \
    }                                                               \
  } while (0)

__global__ void dummy_kernel() {
  for (int i = 0; i < 100; i++) __nanosleep(1000000);  // 100ms
}

template <typename T>
__global__ void set_data(T* data, int size, int myRank) {
  for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size;
       idx += gridDim.x * blockDim.x) {
    data[idx] = myRank * 0.11f;
  }
}

template <typename T>
__global__ void convert_data(const T* data1, const T* data2, double* fdata1,
                             double* fdata2, int size) {
  for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size;
       idx += gridDim.x * blockDim.x) {
    fdata1[idx] = data1[idx];
    fdata2[idx] = data2[idx];
  }
}

__global__ void init_rand(curandState_t* state, int size, int nRanks) {
  for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size;
       idx += gridDim.x * blockDim.x) {
    for (int i = 0; i < nRanks; i++) {
      curand_init(i + 1, idx, 0, &state[idx * nRanks + i]);
    }
  }
}

template <typename T>
__global__ void gen_data(curandState_t* state, T* data, double* ground_truth,
                         int myRank, int nRanks, int size) {
  for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size;
       idx += gridDim.x * blockDim.x) {
    double sum = 0.0;
    for (int i = 0; i < nRanks; i++) {
      double val = curand_uniform_double(&state[idx * nRanks + i]) * 4;
      T hval = val;  // downcast first
      sum += static_cast<double>(hval);
      if (i == myRank) data[idx] = hval;
    }
    ground_truth[idx] = sum;
  }
}

template <typename T>
void run(int myRank, int nRanks, ncclComm_t& comm, int threads, int block_limit,
         int data_size, bool performance_test) {
  T* result;
  cudaStream_t stream;
  CUDACHECK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
  CUDACHECK(cudaMalloc(&result, data_size * sizeof(T)));
  CUDACHECK(cudaMemset(result, 0, data_size * sizeof(T)));

  cudaIpcMemHandle_t self_data_handle;
  cudaIpcMemHandle_t data_handles[8];
  vllm::Signal* buffer;
  T* self_data_copy;
  /**
   * Allocate IPC buffer
   *
   * The first section is a temporary buffer for storing intermediate allreduce
   * results, if a particular algorithm requires it. The second section is for
   * the input to the allreduce. The actual API takes the input pointer as an
   * argument (that is, they can and usually should be allocated separately).
   * But since the input pointers and the temporary buffer all require IPC
   * registration, they are allocated and registered together in the test for
   * convenience.
   */
  CUDACHECK(
      cudaMalloc(&buffer, 2 * data_size * sizeof(T) + sizeof(vllm::Signal)));
  CUDACHECK(
      cudaMemset(buffer, 0, 2 * data_size * sizeof(T) + sizeof(vllm::Signal)));
  CUDACHECK(cudaMalloc(&self_data_copy, data_size * sizeof(T)));
  CUDACHECK(cudaIpcGetMemHandle(&self_data_handle, buffer));

  MPICHECK(MPI_Allgather(&self_data_handle, sizeof(cudaIpcMemHandle_t),
                         MPI_BYTE, data_handles, sizeof(cudaIpcMemHandle_t),
                         MPI_BYTE, MPI_COMM_WORLD));

  void* rank_data;
  size_t rank_data_sz = 16 * 1024 * 1024;
  CUDACHECK(cudaMalloc(&rank_data, rank_data_sz));
  std::vector<int64_t> offsets(nRanks, 0);
  vllm::CustomAllreduce fa(buffer, rank_data, rank_data_sz, data_handles,
                           offsets, myRank);
  auto* self_data =
      reinterpret_cast<T*>(reinterpret_cast<char*>(buffer) +
                           sizeof(vllm::Signal) + data_size * sizeof(T));
  // hack buffer registration
  {
    std::vector<std::string> handles;
    handles.reserve(nRanks);
    for (int i = 0; i < nRanks; i++) {
      char* begin = (char*)&data_handles[i];
      char* end = (char*)&data_handles[i + 1];
      handles.emplace_back(begin, end);
    }
    std::vector<int64_t> offsets(nRanks,
                                 sizeof(vllm::Signal) + data_size * sizeof(T));
    fa.register_buffer(handles, offsets, self_data);
  }

  double* ground_truth;
  CUDACHECK(cudaMallocHost(&ground_truth, data_size * sizeof(double)));
  curandState_t* states;
  CUDACHECK(cudaMalloc(&states, sizeof(curandState_t) * nRanks * data_size));
  init_rand<<<108, 1024, 0, stream>>>(states, data_size, nRanks);
  gen_data<T><<<108, 1024, 0, stream>>>(states, self_data, ground_truth, myRank,
                                        nRanks, data_size);
  CUDACHECK(cudaMemcpyAsync(self_data_copy, self_data, data_size * sizeof(T),
|
||||||
|
cudaMemcpyDeviceToDevice, stream));
|
||||||
|
cudaEvent_t start, stop;
|
||||||
|
CUDACHECK(cudaEventCreate(&start));
|
||||||
|
CUDACHECK(cudaEventCreate(&stop));
|
||||||
|
|
||||||
|
ncclDataType_t ncclDtype;
|
||||||
|
if (std::is_same<T, half>::value) {
|
||||||
|
ncclDtype = ncclFloat16;
|
||||||
|
} else if (std::is_same<T, nv_bfloat16>::value) {
|
||||||
|
ncclDtype = ncclBfloat16;
|
||||||
|
} else {
|
||||||
|
ncclDtype = ncclFloat;
|
||||||
|
}
|
||||||
|
double *nccl_result, *my_result;
|
||||||
|
CUDACHECK(cudaMallocHost(&nccl_result, data_size * sizeof(double)));
|
||||||
|
CUDACHECK(cudaMallocHost(&my_result, data_size * sizeof(double)));
|
||||||
|
if (performance_test) {
|
||||||
|
dummy_kernel<<<1, 1, 0, stream>>>();
|
||||||
|
constexpr int warmup_iters = 5;
|
||||||
|
constexpr int num_iters = 100;
|
||||||
|
// warmup
|
||||||
|
for (int i = 0; i < warmup_iters; i++) {
|
||||||
|
NCCLCHECK(ncclAllReduce(result, result, data_size, ncclDtype, ncclSum,
|
||||||
|
comm, stream));
|
||||||
|
}
|
||||||
|
CUDACHECK(cudaEventRecord(start, stream));
|
||||||
|
for (int i = 0; i < num_iters; i++) {
|
||||||
|
NCCLCHECK(ncclAllReduce(result, result, data_size, ncclDtype, ncclSum,
|
||||||
|
comm, stream));
|
||||||
|
}
|
||||||
|
CUDACHECK(cudaEventRecord(stop, stream));
|
||||||
|
CUDACHECK(cudaStreamSynchronize(stream));
|
||||||
|
float allreduce_ms = 0;
|
||||||
|
cudaEventElapsedTime(&allreduce_ms, start, stop);
|
||||||
|
|
||||||
|
dummy_kernel<<<1, 1, 0, stream>>>();
|
||||||
|
// warm up
|
||||||
|
for (int i = 0; i < warmup_iters; i++) {
|
||||||
|
fa.allreduce<T>(stream, self_data, result, data_size, threads,
|
||||||
|
block_limit);
|
||||||
|
}
|
||||||
|
CUDACHECK(cudaEventRecord(start, stream));
|
||||||
|
for (int i = 0; i < num_iters; i++) {
|
||||||
|
fa.allreduce<T>(stream, self_data, result, data_size, threads,
|
||||||
|
block_limit);
|
||||||
|
}
|
||||||
|
CUDACHECK(cudaEventRecord(stop, stream));
|
||||||
|
CUDACHECK(cudaStreamSynchronize(stream));
|
||||||
|
|
||||||
|
float duration_ms = 0;
|
||||||
|
cudaEventElapsedTime(&duration_ms, start, stop);
|
||||||
|
if (myRank == 0)
|
||||||
|
printf(
|
||||||
|
"Rank %d done, nGPUs:%d, sz (kb): %d, %d, %d, my time:%.2fus, nccl "
|
||||||
|
"time:%.2fus\n",
|
||||||
|
myRank, nRanks, data_size * sizeof(T) / 1024, threads, block_limit,
|
||||||
|
duration_ms * 1e3 / num_iters, allreduce_ms * 1e3 / num_iters);
|
||||||
|
|
||||||
|
// And wait for all the queued up work to complete
|
||||||
|
CUDACHECK(cudaStreamSynchronize(stream));
|
||||||
|
|
||||||
|
NCCLCHECK(ncclAllReduce(self_data_copy, self_data, data_size, ncclDtype,
|
||||||
|
ncclSum, comm, stream));
|
||||||
|
|
||||||
|
convert_data<T><<<108, 1024, 0, stream>>>(self_data, result, nccl_result,
|
||||||
|
my_result, data_size);
|
||||||
|
CUDACHECK(cudaStreamSynchronize(stream));
|
||||||
|
|
||||||
|
for (unsigned long j = 0; j < data_size; j++) {
|
||||||
|
auto diff = abs(nccl_result[j] - my_result[j]);
|
||||||
|
if (diff >= 4e-2) {
|
||||||
|
printf("Rank %d: Verification mismatch at %lld: %f != (my) %f, gt=%f\n",
|
||||||
|
myRank, j, nccl_result[j], my_result[j], ground_truth[j]);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
long double nccl_diffs = 0.0;
|
||||||
|
long double my_diffs = 0.0;
|
||||||
|
for (int j = 0; j < data_size; j++) {
|
||||||
|
nccl_diffs += abs(nccl_result[j] - ground_truth[j]);
|
||||||
|
my_diffs += abs(my_result[j] - ground_truth[j]);
|
||||||
|
}
|
||||||
|
if (myRank == 0)
|
||||||
|
std::cout << "average abs diffs: nccl: " << nccl_diffs / data_size
|
||||||
|
<< " me: " << my_diffs / data_size << std::endl;
|
||||||
|
} else {
|
||||||
|
for (int i = 0; i < 100; i++) {
|
||||||
|
fa.allreduce<T>(stream, self_data, result, data_size, threads,
|
||||||
|
block_limit);
|
||||||
|
CUDACHECK(cudaStreamSynchronize(stream));
|
||||||
|
NCCLCHECK(ncclAllReduce(self_data, self_data_copy, data_size, ncclDtype,
|
||||||
|
ncclSum, comm, stream));
|
||||||
|
convert_data<T><<<108, 1024, 0, stream>>>(
|
||||||
|
self_data_copy, result, nccl_result, my_result, data_size);
|
||||||
|
CUDACHECK(cudaStreamSynchronize(stream));
|
||||||
|
|
||||||
|
for (unsigned long j = 0; j < data_size; j++) {
|
||||||
|
auto diff = abs(nccl_result[j] - my_result[j]);
|
||||||
|
if (diff >= 4e-2) {
|
||||||
|
printf(
|
||||||
|
"Rank %d: Verification mismatch at %lld: %f != (my) %f, gt=%f\n",
|
||||||
|
myRank, j, nccl_result[j], my_result[j], ground_truth[j]);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (myRank == 0)
|
||||||
|
printf("Test passed: nGPUs:%d, sz (kb): %d, %d, %d\n", nRanks,
|
||||||
|
data_size * sizeof(T) / 1024, threads, block_limit);
|
||||||
|
// long double nccl_diffs = 0.0;
|
||||||
|
// long double my_diffs = 0.0;
|
||||||
|
// for (int j = 0; j < data_size; j++) {
|
||||||
|
// nccl_diffs += abs(nccl_result[j] - ground_truth[j]);
|
||||||
|
// my_diffs += abs(my_result[j] - ground_truth[j]);
|
||||||
|
// }
|
||||||
|
// if (myRank == 0)
|
||||||
|
// std::cout << "average abs diffs: nccl: " << nccl_diffs / data_size
|
||||||
|
// << " me: " << my_diffs / data_size << std::endl;
|
||||||
|
}
|
||||||
|
|
||||||
|
CUDACHECK(cudaFree(result));
|
||||||
|
CUDACHECK(cudaFree(self_data_copy));
|
||||||
|
CUDACHECK(cudaFree(rank_data));
|
||||||
|
CUDACHECK(cudaFree(buffer));
|
||||||
|
CUDACHECK(cudaFree(states));
|
||||||
|
CUDACHECK(cudaFreeHost(ground_truth));
|
||||||
|
CUDACHECK(cudaFreeHost(nccl_result));
|
||||||
|
CUDACHECK(cudaFreeHost(my_result));
|
||||||
|
CUDACHECK(cudaStreamDestroy(stream));
|
||||||
|
}
|
||||||
|
|
||||||
|
int main(int argc, char** argv) {
|
||||||
|
int nRanks, myRank;
|
||||||
|
MPICHECK(MPI_Init(&argc, &argv));
|
||||||
|
MPICHECK(MPI_Comm_rank(MPI_COMM_WORLD, &myRank));
|
||||||
|
MPICHECK(MPI_Comm_size(MPI_COMM_WORLD, &nRanks));
|
||||||
|
CUDACHECK(cudaSetDevice(myRank));
|
||||||
|
ncclUniqueId id;
|
||||||
|
ncclComm_t comm;
|
||||||
|
if (myRank == 0) ncclGetUniqueId(&id);
|
||||||
|
MPICHECK(MPI_Bcast(static_cast<void*>(&id), sizeof(id), MPI_BYTE, 0,
|
||||||
|
MPI_COMM_WORLD));
|
||||||
|
NCCLCHECK(ncclCommInitRank(&comm, nRanks, id, myRank));
|
||||||
|
|
||||||
|
bool performance_test = true;
|
||||||
|
cudaProfilerStart();
|
||||||
|
// for (int threads : {256, 512}) {
|
||||||
|
// for (int block_limit = 16; block_limit < 112; block_limit += 4) {
|
||||||
|
// run<half>(myRank, nRanks, comm, threads, block_limit, 4096 * 1024);
|
||||||
|
// }
|
||||||
|
// }
|
||||||
|
for (int sz = 512; sz <= (8 << 20); sz *= 2) {
|
||||||
|
run<half>(myRank, nRanks, comm, 512, 36, sz + 8 * 47, performance_test);
|
||||||
|
}
|
||||||
|
|
||||||
|
cudaProfilerStop();
|
||||||
|
return EXIT_SUCCESS;
|
||||||
|
}
|
||||||
@ -2,13 +2,34 @@
|
|||||||
* Adapted from
|
* Adapted from
|
||||||
* https://github.com/pytorch/pytorch/blob/v2.0.1/aten/src/ATen/Dispatch.h
|
* https://github.com/pytorch/pytorch/blob/v2.0.1/aten/src/ATen/Dispatch.h
|
||||||
*/
|
*/
|
||||||
#include <torch/extension.h>
|
#pragma once
|
||||||
|
|
||||||
#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \
|
#include <torch/all.h>
|
||||||
AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \
|
|
||||||
AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \
|
#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \
|
||||||
|
AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \
|
||||||
|
AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \
|
||||||
AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__)
|
AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__)
|
||||||
|
|
||||||
#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \
|
#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \
|
||||||
AT_DISPATCH_SWITCH( \
|
AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__))
|
||||||
TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__))
|
|
||||||
|
#define VLLM_DISPATCH_CASE_FLOATING_AND_BYTE_TYPES(...) \
|
||||||
|
AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \
|
||||||
|
AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \
|
||||||
|
AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \
|
||||||
|
AT_DISPATCH_CASE(at::ScalarType::Byte, __VA_ARGS__)
|
||||||
|
|
||||||
|
#define VLLM_DISPATCH_FLOATING_AND_BYTE_TYPES(TYPE, NAME, ...) \
|
||||||
|
AT_DISPATCH_SWITCH(TYPE, NAME, \
|
||||||
|
VLLM_DISPATCH_CASE_FLOATING_AND_BYTE_TYPES(__VA_ARGS__))
|
||||||
|
|
||||||
|
#define VLLM_DISPATCH_CASE_INTEGRAL_TYPES(...) \
|
||||||
|
AT_DISPATCH_CASE(at::ScalarType::Byte, __VA_ARGS__) \
|
||||||
|
AT_DISPATCH_CASE(at::ScalarType::Char, __VA_ARGS__) \
|
||||||
|
AT_DISPATCH_CASE(at::ScalarType::Short, __VA_ARGS__) \
|
||||||
|
AT_DISPATCH_CASE(at::ScalarType::Int, __VA_ARGS__) \
|
||||||
|
AT_DISPATCH_CASE(at::ScalarType::Long, __VA_ARGS__)
|
||||||
|
|
||||||
|
#define VLLM_DISPATCH_INTEGRAL_TYPES(TYPE, NAME, ...) \
|
||||||
|
AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_INTEGRAL_TYPES(__VA_ARGS__))
|
||||||
|
|||||||
@ -1,24 +0,0 @@
|
|||||||
#include <torch/extension.h>
|
|
||||||
|
|
||||||
void rms_norm(
|
|
||||||
torch::Tensor& out,
|
|
||||||
torch::Tensor& input,
|
|
||||||
torch::Tensor& weight,
|
|
||||||
float epsilon);
|
|
||||||
|
|
||||||
void fused_add_rms_norm(
|
|
||||||
torch::Tensor& input,
|
|
||||||
torch::Tensor& residual,
|
|
||||||
torch::Tensor& weight,
|
|
||||||
float epsilon);
|
|
||||||
|
|
||||||
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
|
|
||||||
m.def(
|
|
||||||
"rms_norm",
|
|
||||||
&rms_norm,
|
|
||||||
"Apply Root Mean Square (RMS) Normalization to the input tensor.");
|
|
||||||
m.def(
|
|
||||||
"fused_add_rms_norm",
|
|
||||||
&fused_add_rms_norm,
|
|
||||||
"In-place fused Add and RMS Normalization");
|
|
||||||
}
|
|
||||||
@ -1,25 +1,34 @@
|
|||||||
#include <torch/extension.h>
|
#include <torch/all.h>
|
||||||
#include <ATen/cuda/CUDAContext.h>
|
#include <ATen/cuda/CUDAContext.h>
|
||||||
|
#include <c10/cuda/CUDAGuard.h>
|
||||||
|
|
||||||
#include "dispatch_utils.h"
|
#include "dispatch_utils.h"
|
||||||
#include "reduction_utils.cuh"
|
#include "reduction_utils.cuh"
|
||||||
|
#ifndef USE_ROCM
|
||||||
|
#include <cuda_bf16.h>
|
||||||
|
#include <cuda_fp16.h>
|
||||||
|
#else
|
||||||
|
#include <hip/hip_bf16.h>
|
||||||
|
#include <hip/hip_fp16.h>
|
||||||
|
|
||||||
|
using __nv_bfloat16 = __hip_bfloat16;
|
||||||
|
using __nv_bfloat162 = __hip_bfloat162;
|
||||||
|
#endif
|
||||||
|
|
||||||
namespace vllm {
|
namespace vllm {
|
||||||
|
|
||||||
// TODO(woosuk): Further optimize this kernel.
|
// TODO(woosuk): Further optimize this kernel.
|
||||||
template<typename scalar_t>
|
template <typename scalar_t>
|
||||||
__global__ void rms_norm_kernel(
|
__global__ void rms_norm_kernel(
|
||||||
scalar_t* __restrict__ out, // [..., hidden_size]
|
scalar_t* __restrict__ out, // [..., hidden_size]
|
||||||
const scalar_t* __restrict__ input, // [..., hidden_size]
|
const scalar_t* __restrict__ input, // [..., hidden_size]
|
||||||
const scalar_t* __restrict__ weight, // [hidden_size]
|
const scalar_t* __restrict__ weight, // [hidden_size]
|
||||||
const float epsilon,
|
const float epsilon, const int num_tokens, const int hidden_size) {
|
||||||
const int num_tokens,
|
|
||||||
const int hidden_size) {
|
|
||||||
__shared__ float s_variance;
|
__shared__ float s_variance;
|
||||||
float variance = 0.0f;
|
float variance = 0.0f;
|
||||||
|
|
||||||
for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
|
for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
|
||||||
const float x = (float) input[blockIdx.x * hidden_size + idx];
|
const float x = (float)input[blockIdx.x * hidden_size + idx];
|
||||||
variance += x * x;
|
variance += x * x;
|
||||||
}
|
}
|
||||||
variance = blockReduceSum<float>(variance);
|
variance = blockReduceSum<float>(variance);
|
||||||
@ -29,89 +38,315 @@ __global__ void rms_norm_kernel(
|
|||||||
__syncthreads();
|
__syncthreads();
|
||||||
|
|
||||||
for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
|
for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
|
||||||
float x = (float) input[blockIdx.x * hidden_size + idx];
|
float x = (float)input[blockIdx.x * hidden_size + idx];
|
||||||
out[blockIdx.x * hidden_size + idx] = ((scalar_t) (x * s_variance)) * weight[idx];
|
out[blockIdx.x * hidden_size + idx] =
|
||||||
|
((scalar_t)(x * s_variance)) * weight[idx];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO: Further optimize this kernel.
|
/* Converter structs for the conversion from torch types to HIP/CUDA types,
|
||||||
template<typename scalar_t>
|
and the associated type conversions within HIP/CUDA. These helpers need
|
||||||
__global__ void fused_add_rms_norm_kernel(
|
to be implemented for now because the relevant type conversion
|
||||||
scalar_t* __restrict__ input, // [..., hidden_size]
|
operators/constructors are not consistently implemented by HIP/CUDA, so
|
||||||
scalar_t* __restrict__ residual, // [..., hidden_size]
|
a generic conversion via type casts cannot be implemented.
|
||||||
const scalar_t* __restrict__ weight, // [hidden_size]
|
|
||||||
const float epsilon,
|
Each struct should have the member static constexpr bool `exists`:
|
||||||
const int num_tokens,
|
If false, the optimized kernel is not used for the corresponding torch type.
|
||||||
const int hidden_size) {
|
If true, the struct should be fully defined as shown in the examples below.
|
||||||
|
*/
|
||||||
|
template <typename torch_type>
|
||||||
|
struct _typeConvert {
|
||||||
|
static constexpr bool exists = false;
|
||||||
|
};
|
||||||
|
|
||||||
|
#if defined(USE_ROCM) || (defined(CUDA_VERSION) && (CUDA_VERSION >= 12000))
|
||||||
|
// CUDA < 12.0 runs into issues with packed type conversion
|
||||||
|
template <>
|
||||||
|
struct _typeConvert<c10::Half> {
|
||||||
|
static constexpr bool exists = true;
|
||||||
|
using hip_type = __half;
|
||||||
|
using packed_hip_type = __half2;
|
||||||
|
|
||||||
|
__device__ static inline float convert(hip_type x) { return __half2float(x); }
|
||||||
|
__device__ static inline float2 convert(packed_hip_type x) {
|
||||||
|
return __half22float2(x);
|
||||||
|
}
|
||||||
|
__device__ static inline hip_type convert(float x) {
|
||||||
|
return __float2half_rn(x);
|
||||||
|
}
|
||||||
|
__device__ static inline packed_hip_type convert(float2 x) {
|
||||||
|
return __float22half2_rn(x);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
|
||||||
|
// CUDA_ARCH < 800 does not have BF16 support
|
||||||
|
// TODO: Add in ROCm support once public headers handle bf16 maturely
|
||||||
|
template <>
|
||||||
|
struct _typeConvert<c10::BFloat16> {
|
||||||
|
static constexpr bool exists = true;
|
||||||
|
using hip_type = __nv_bfloat16;
|
||||||
|
using packed_hip_type = __nv_bfloat162;
|
||||||
|
|
||||||
|
__device__ static inline float convert(hip_type x) {
|
||||||
|
return __bfloat162float(x);
|
||||||
|
}
|
||||||
|
__device__ static inline float2 convert(packed_hip_type x) {
|
||||||
|
return __bfloat1622float2(x);
|
||||||
|
}
|
||||||
|
__device__ static inline hip_type convert(float x) {
|
||||||
|
return __float2bfloat16(x);
|
||||||
|
}
|
||||||
|
__device__ static inline packed_hip_type convert(float2 x) {
|
||||||
|
return __float22bfloat162_rn(x);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
#endif // defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
|
||||||
|
#endif // defined(USE_ROCM) || (defined(CUDA_VERSION) && (CUDA_VERSION >=
|
||||||
|
// 12000))
|
||||||
|
|
||||||
|
/* Vector POD struct to generate vectorized and packed FP16/BF16 ops
|
||||||
|
for appropriate specializations of fused_add_rms_norm_kernel.
|
||||||
|
Only functions that are necessary in that kernel are implemented.
|
||||||
|
Alignment to 16 bytes is required to use 128-bit global memory ops.
|
||||||
|
*/
|
||||||
|
template <typename scalar_t, int width>
|
||||||
|
struct alignas(16) _f16Vec {
|
||||||
|
/* Not theoretically necessary that width is a power of 2 but should
|
||||||
|
almost always be the case for optimization purposes */
|
||||||
|
static_assert(width > 0 && (width & (width - 1)) == 0,
|
||||||
|
"Width is not a positive power of 2!");
|
||||||
|
using Converter = _typeConvert<scalar_t>;
|
||||||
|
using T1 = typename Converter::hip_type;
|
||||||
|
using T2 = typename Converter::packed_hip_type;
|
||||||
|
T1 data[width];
|
||||||
|
|
||||||
|
__device__ _f16Vec& operator+=(const _f16Vec<scalar_t, width>& other) {
|
||||||
|
if constexpr (width % 2 == 0) {
|
||||||
|
#pragma unroll
|
||||||
|
for (int i = 0; i < width; i += 2) {
|
||||||
|
T2 temp{data[i], data[i + 1]};
|
||||||
|
temp += T2{other.data[i], other.data[i + 1]};
|
||||||
|
data[i] = temp.x;
|
||||||
|
data[i + 1] = temp.y;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
#pragma unroll
|
||||||
|
for (int i = 0; i < width; ++i) data[i] += other.data[i];
|
||||||
|
}
|
||||||
|
return *this;
|
||||||
|
}
|
||||||
|
|
||||||
|
__device__ _f16Vec& operator*=(const _f16Vec<scalar_t, width>& other) {
|
||||||
|
if constexpr (width % 2 == 0) {
|
||||||
|
#pragma unroll
|
||||||
|
for (int i = 0; i < width; i += 2) {
|
||||||
|
T2 temp{data[i], data[i + 1]};
|
||||||
|
temp *= T2{other.data[i], other.data[i + 1]};
|
||||||
|
data[i] = temp.x;
|
||||||
|
data[i + 1] = temp.y;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
#pragma unroll
|
||||||
|
for (int i = 0; i < width; ++i) data[i] *= other.data[i];
|
||||||
|
}
|
||||||
|
return *this;
|
||||||
|
}
|
||||||
|
|
||||||
|
__device__ _f16Vec& operator*=(const float scale) {
|
||||||
|
if constexpr (width % 2 == 0) {
|
||||||
|
#pragma unroll
|
||||||
|
for (int i = 0; i < width; i += 2) {
|
||||||
|
float2 temp_f = Converter::convert(T2{data[i], data[i + 1]});
|
||||||
|
temp_f.x *= scale;
|
||||||
|
temp_f.y *= scale;
|
||||||
|
T2 temp = Converter::convert(temp_f);
|
||||||
|
data[i] = temp.x;
|
||||||
|
data[i + 1] = temp.y;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
#pragma unroll
|
||||||
|
for (int i = 0; i < width; ++i) {
|
||||||
|
float temp = Converter::convert(data[i]) * scale;
|
||||||
|
data[i] = Converter::convert(temp);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return *this;
|
||||||
|
}
|
||||||
|
|
||||||
|
__device__ float sum_squares() const {
|
||||||
|
float result = 0.0f;
|
||||||
|
if constexpr (width % 2 == 0) {
|
||||||
|
#pragma unroll
|
||||||
|
for (int i = 0; i < width; i += 2) {
|
||||||
|
float2 z = Converter::convert(T2{data[i], data[i + 1]});
|
||||||
|
result += z.x * z.x + z.y * z.y;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
#pragma unroll
|
||||||
|
for (int i = 0; i < width; ++i) {
|
||||||
|
float x = Converter::convert(data[i]);
|
||||||
|
result += x * x;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
/* Function specialization in the case of FP16/BF16 tensors.
|
||||||
|
Additional optimizations we can make in this case are
|
||||||
|
packed and vectorized operations, which help with the
|
||||||
|
memory latency bottleneck. */
|
||||||
|
template <typename scalar_t, int width>
|
||||||
|
__global__ std::enable_if_t<(width > 0) && _typeConvert<scalar_t>::exists>
|
||||||
|
fused_add_rms_norm_kernel(
|
||||||
|
scalar_t* __restrict__ input, // [..., hidden_size]
|
||||||
|
scalar_t* __restrict__ residual, // [..., hidden_size]
|
||||||
|
const scalar_t* __restrict__ weight, // [hidden_size]
|
||||||
|
const float epsilon, const int num_tokens, const int hidden_size) {
|
||||||
|
// Sanity checks on our vector struct and type-punned pointer arithmetic
|
||||||
|
static_assert(std::is_pod_v<_f16Vec<scalar_t, width>>);
|
||||||
|
static_assert(sizeof(_f16Vec<scalar_t, width>) == sizeof(scalar_t) * width);
|
||||||
|
|
||||||
|
const int vec_hidden_size = hidden_size / width;
|
||||||
|
__shared__ float s_variance;
|
||||||
|
float variance = 0.0f;
|
||||||
|
/* These and the argument pointers are all declared `restrict` as they are
|
||||||
|
not aliased in practice. Argument pointers should not be dereferenced
|
||||||
|
in this kernel as that would be undefined behavior */
|
||||||
|
auto* __restrict__ input_v =
|
||||||
|
reinterpret_cast<_f16Vec<scalar_t, width>*>(input);
|
||||||
|
auto* __restrict__ residual_v =
|
||||||
|
reinterpret_cast<_f16Vec<scalar_t, width>*>(residual);
|
||||||
|
auto* __restrict__ weight_v =
|
||||||
|
reinterpret_cast<const _f16Vec<scalar_t, width>*>(weight);
|
||||||
|
|
||||||
|
for (int idx = threadIdx.x; idx < vec_hidden_size; idx += blockDim.x) {
|
||||||
|
int id = blockIdx.x * vec_hidden_size + idx;
|
||||||
|
_f16Vec<scalar_t, width> temp = input_v[id];
|
||||||
|
temp += residual_v[id];
|
||||||
|
variance += temp.sum_squares();
|
||||||
|
residual_v[id] = temp;
|
||||||
|
}
|
||||||
|
/* Keep the following if-else block in sync with the
|
||||||
|
calculation of max_block_size in fused_add_rms_norm */
|
||||||
|
if (num_tokens < 256) {
|
||||||
|
variance = blockReduceSum<float, 1024>(variance);
|
||||||
|
} else
|
||||||
|
variance = blockReduceSum<float, 256>(variance);
|
||||||
|
if (threadIdx.x == 0) {
|
||||||
|
s_variance = rsqrtf(variance / hidden_size + epsilon);
|
||||||
|
}
|
||||||
|
__syncthreads();
|
||||||
|
|
||||||
|
for (int idx = threadIdx.x; idx < vec_hidden_size; idx += blockDim.x) {
|
||||||
|
int id = blockIdx.x * vec_hidden_size + idx;
|
||||||
|
_f16Vec<scalar_t, width> temp = residual_v[id];
|
||||||
|
temp *= s_variance;
|
||||||
|
temp *= weight_v[idx];
|
||||||
|
input_v[id] = temp;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Generic fused_add_rms_norm_kernel
|
||||||
|
The width field is not used here but necessary for other specializations.
|
||||||
|
*/
|
||||||
|
template <typename scalar_t, int width>
|
||||||
|
__global__ std::enable_if_t<(width == 0) || !_typeConvert<scalar_t>::exists>
|
||||||
|
fused_add_rms_norm_kernel(
|
||||||
|
scalar_t* __restrict__ input, // [..., hidden_size]
|
||||||
|
scalar_t* __restrict__ residual, // [..., hidden_size]
|
||||||
|
const scalar_t* __restrict__ weight, // [hidden_size]
|
||||||
|
const float epsilon, const int num_tokens, const int hidden_size) {
|
||||||
__shared__ float s_variance;
|
__shared__ float s_variance;
|
||||||
float variance = 0.0f;
|
float variance = 0.0f;
|
||||||
|
|
||||||
for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
|
for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
|
||||||
float x = (float) input[blockIdx.x * hidden_size + idx];
|
scalar_t z = input[blockIdx.x * hidden_size + idx];
|
||||||
x += (float) residual[blockIdx.x * hidden_size + idx];
|
z += residual[blockIdx.x * hidden_size + idx];
|
||||||
|
float x = (float)z;
|
||||||
variance += x * x;
|
variance += x * x;
|
||||||
residual[blockIdx.x * hidden_size + idx] = (scalar_t) x;
|
residual[blockIdx.x * hidden_size + idx] = z;
|
||||||
}
|
}
|
||||||
variance = blockReduceSum<float>(variance);
|
/* Keep the following if-else block in sync with the
|
||||||
|
calculation of max_block_size in fused_add_rms_norm */
|
||||||
|
if (num_tokens < 256) {
|
||||||
|
variance = blockReduceSum<float, 1024>(variance);
|
||||||
|
} else
|
||||||
|
variance = blockReduceSum<float, 256>(variance);
|
||||||
if (threadIdx.x == 0) {
|
if (threadIdx.x == 0) {
|
||||||
s_variance = rsqrtf(variance / hidden_size + epsilon);
|
s_variance = rsqrtf(variance / hidden_size + epsilon);
|
||||||
}
|
}
|
||||||
__syncthreads();
|
__syncthreads();
|
||||||
|
|
||||||
for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
|
for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
|
||||||
float x = (float) residual[blockIdx.x * hidden_size + idx];
|
float x = (float)residual[blockIdx.x * hidden_size + idx];
|
||||||
input[blockIdx.x * hidden_size + idx] = ((scalar_t) (x * s_variance)) * weight[idx];
|
input[blockIdx.x * hidden_size + idx] =
|
||||||
|
((scalar_t)(x * s_variance)) * weight[idx];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
} // namespace vllm
|
} // namespace vllm
|
||||||
|
|
||||||
void rms_norm(
|
void rms_norm(torch::Tensor& out, // [..., hidden_size]
|
||||||
torch::Tensor& out, // [..., hidden_size]
|
torch::Tensor& input, // [..., hidden_size]
|
||||||
torch::Tensor& input, // [..., hidden_size]
|
torch::Tensor& weight, // [hidden_size]
|
||||||
torch::Tensor& weight, // [hidden_size]
|
double epsilon) {
|
||||||
float epsilon) {
|
|
||||||
int hidden_size = input.size(-1);
|
int hidden_size = input.size(-1);
|
||||||
int num_tokens = input.numel() / hidden_size;
|
int num_tokens = input.numel() / hidden_size;
|
||||||
|
|
||||||
dim3 grid(num_tokens);
|
dim3 grid(num_tokens);
|
||||||
dim3 block(std::min(hidden_size, 1024));
|
dim3 block(std::min(hidden_size, 1024));
|
||||||
|
const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
|
||||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||||
VLLM_DISPATCH_FLOATING_TYPES(
|
VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "rms_norm_kernel", [&] {
|
||||||
input.scalar_type(),
|
vllm::rms_norm_kernel<scalar_t><<<grid, block, 0, stream>>>(
|
||||||
"rms_norm_kernel",
|
out.data_ptr<scalar_t>(), input.data_ptr<scalar_t>(),
|
||||||
[&] {
|
weight.data_ptr<scalar_t>(), epsilon, num_tokens, hidden_size);
|
||||||
vllm::rms_norm_kernel<scalar_t><<<grid, block, 0, stream>>>(
|
});
|
||||||
out.data_ptr<scalar_t>(),
|
|
||||||
input.data_ptr<scalar_t>(),
|
|
||||||
weight.data_ptr<scalar_t>(),
|
|
||||||
epsilon,
|
|
||||||
num_tokens,
|
|
||||||
hidden_size);
|
|
||||||
});
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void fused_add_rms_norm(
|
#define LAUNCH_FUSED_ADD_RMS_NORM(width) \
|
||||||
torch::Tensor& input, // [..., hidden_size]
|
VLLM_DISPATCH_FLOATING_TYPES( \
|
||||||
torch::Tensor& residual, // [..., hidden_size]
|
input.scalar_type(), "fused_add_rms_norm_kernel", [&] { \
|
||||||
torch::Tensor& weight, // [hidden_size]
|
vllm::fused_add_rms_norm_kernel<scalar_t, width> \
|
||||||
float epsilon) {
|
<<<grid, block, 0, stream>>>(input.data_ptr<scalar_t>(), \
|
||||||
|
residual.data_ptr<scalar_t>(), \
|
||||||
|
weight.data_ptr<scalar_t>(), epsilon, \
|
||||||
|
num_tokens, hidden_size); \
|
||||||
|
});
|
||||||
|
|
||||||
|
void fused_add_rms_norm(torch::Tensor& input, // [..., hidden_size]
|
||||||
|
torch::Tensor& residual, // [..., hidden_size]
|
||||||
|
torch::Tensor& weight, // [hidden_size]
|
||||||
|
double epsilon) {
|
||||||
int hidden_size = input.size(-1);
|
int hidden_size = input.size(-1);
|
||||||
int num_tokens = input.numel() / hidden_size;
|
int num_tokens = input.numel() / hidden_size;
|
||||||
|
|
||||||
dim3 grid(num_tokens);
|
dim3 grid(num_tokens);
|
||||||
dim3 block(std::min(hidden_size, 1024));
|
/* This kernel is memory-latency bound in many scenarios.
|
||||||
|
When num_tokens is large, a smaller block size allows
|
||||||
|
for increased block occupancy on CUs and better latency
|
||||||
|
hiding on global mem ops. */
|
||||||
|
const int max_block_size = (num_tokens < 256) ? 1024 : 256;
|
||||||
|
dim3 block(std::min(hidden_size, max_block_size));
|
||||||
|
const at::cuda::OptionalCUDAGuard device_guard(device_of(input));
|
||||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||||
VLLM_DISPATCH_FLOATING_TYPES(
|
/*If the tensor types are FP16/BF16, try to use the optimized kernel
|
||||||
input.scalar_type(),
|
with packed + vectorized ops.
|
||||||
"fused_add_rms_norm_kernel",
|
Max optimization is achieved with a width-8 vector of FP16/BF16s
|
||||||
[&] {
|
since we can load at most 128 bits at once in a global memory op.
|
||||||
vllm::fused_add_rms_norm_kernel<scalar_t><<<grid, block, 0, stream>>>(
|
However, this requires each tensor's data to be aligned to 16
|
||||||
input.data_ptr<scalar_t>(),
|
bytes.
|
||||||
residual.data_ptr<scalar_t>(),
|
*/
|
||||||
weight.data_ptr<scalar_t>(),
|
auto inp_ptr = reinterpret_cast<std::uintptr_t>(input.data_ptr());
|
||||||
epsilon,
|
auto res_ptr = reinterpret_cast<std::uintptr_t>(residual.data_ptr());
|
||||||
num_tokens,
|
auto wt_ptr = reinterpret_cast<std::uintptr_t>(weight.data_ptr());
|
||||||
hidden_size);
|
bool ptrs_are_aligned =
|
||||||
});
|
inp_ptr % 16 == 0 && res_ptr % 16 == 0 && wt_ptr % 16 == 0;
|
||||||
|
if (ptrs_are_aligned && hidden_size % 8 == 0) {
|
||||||
|
LAUNCH_FUSED_ADD_RMS_NORM(8);
|
||||||
|
} else {
|
||||||
|
LAUNCH_FUSED_ADD_RMS_NORM(0);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
7
csrc/moe/moe_ops.h
Normal file
7
csrc/moe/moe_ops.h
Normal file
@ -0,0 +1,7 @@
|
|||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <torch/all.h>
|
||||||
|
|
||||||
|
void topk_softmax(torch::Tensor& topk_weights, torch::Tensor& topk_indices,
|
||||||
|
torch::Tensor& token_expert_indices,
|
||||||
|
torch::Tensor& gating_output);
|
||||||
506
csrc/moe/topk_softmax_kernels.cu
Normal file
506
csrc/moe/topk_softmax_kernels.cu
Normal file
@ -0,0 +1,506 @@
|
|||||||
|
/*
|
||||||
|
* Adapted from https://github.com/NVIDIA/TensorRT-LLM/blob/v0.7.1/cpp/tensorrt_llm/kernels/mixtureOfExperts/moe_kernels.cu
|
||||||
|
* Copyright (c) 2024, The vLLM team.
|
||||||
|
* SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||||
|
* SPDX-License-Identifier: Apache-2.0
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
#include <torch/all.h>
|
||||||
|
#include <ATen/cuda/CUDAContext.h>
|
||||||
|
#include <c10/cuda/CUDAGuard.h>
|
||||||
|
#include "../cuda_compat.h"
|
||||||
|
|
||||||
|
#ifndef USE_ROCM
|
||||||
|
#include <cub/util_type.cuh>
|
||||||
|
#include <cub/cub.cuh>
|
||||||
|
#else
|
||||||
|
#include <hipcub/util_type.hpp>
|
||||||
|
#include <hipcub/hipcub.hpp>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
||||||
|
#define MIN(a, b) ((a) < (b) ? (a) : (b))
|
||||||
|
|
||||||
|
namespace vllm {
|
||||||
|
namespace moe {
|
||||||
|
|
||||||
|
/// Aligned array type
|
||||||
|
template <
|
||||||
|
typename T,
|
||||||
|
/// Number of elements in the array
|
||||||
|
int N,
|
||||||
|
/// Alignment requirement in bytes
|
||||||
|
int Alignment = sizeof(T) * N
|
||||||
|
>
|
||||||
|
class alignas(Alignment) AlignedArray {
|
||||||
|
float data[N];
|
||||||
|
};
|
||||||
|
|
||||||
|
// ====================== Softmax things ===============================
|
||||||
|
// We have our own implementation of softmax here so we can support transposing the output
|
||||||
|
// in the softmax kernel when we extend this module to support expert-choice routing.
|
||||||
|
template <int TPB>
|
||||||
|
__launch_bounds__(TPB) __global__
|
||||||
|
void moeSoftmax(const float* input, const bool* finished, float* output, const int num_cols)
|
||||||
|
{
|
||||||
|
using BlockReduce = cub::BlockReduce<float, TPB>;
|
||||||
|
__shared__ typename BlockReduce::TempStorage tmpStorage;
|
||||||
|
|
||||||
|
__shared__ float normalizing_factor;
|
||||||
|
__shared__ float float_max;
|
||||||
|
|
||||||
|
const int thread_row_offset = blockIdx.x * num_cols;
|
||||||
|
|
||||||
|
cub::Sum sum;
|
||||||
|
float threadData(-FLT_MAX);
|
||||||
|
|
||||||
|
// Don't touch finished rows.
|
||||||
|
if ((finished != nullptr) && finished[blockIdx.x])
|
||||||
|
{
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int ii = threadIdx.x; ii < num_cols; ii += TPB)
|
||||||
|
{
|
||||||
|
const int idx = thread_row_offset + ii;
|
||||||
|
threadData = max(static_cast<float>(input[idx]), threadData);
|
||||||
|
}
|
||||||
|
|
||||||
|
const float maxElem = BlockReduce(tmpStorage).Reduce(threadData, cub::Max());
|
||||||
|
if (threadIdx.x == 0)
|
||||||
|
{
|
||||||
|
float_max = maxElem;
|
||||||
|
}
|
||||||
|
__syncthreads();
|
||||||
|
|
||||||
|
threadData = 0;
|
||||||
|
|
||||||
|
for (int ii = threadIdx.x; ii < num_cols; ii += TPB)
|
||||||
|
{
|
||||||
|
const int idx = thread_row_offset + ii;
|
||||||
|
threadData += exp((static_cast<float>(input[idx]) - float_max));
|
||||||
|
}
|
||||||
|
|
||||||
|
const auto Z = BlockReduce(tmpStorage).Reduce(threadData, sum);
|
||||||
|
|
||||||
|
if (threadIdx.x == 0)
|
||||||
|
{
|
||||||
|
normalizing_factor = 1.f / Z;
|
||||||
|
}
|
||||||
|
__syncthreads();
|
||||||
|
|
||||||
|
for (int ii = threadIdx.x; ii < num_cols; ii += TPB)
|
||||||
|
{
|
||||||
|
const int idx = thread_row_offset + ii;
|
||||||
|
const float val = exp((static_cast<float>(input[idx]) - float_max)) * normalizing_factor;
|
||||||
|
output[idx] = val;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
template <int TPB>
|
||||||
|
__launch_bounds__(TPB) __global__ void moeTopK(const float* inputs_after_softmax, const bool* finished, float* output,
|
||||||
|
int* indices, int* source_rows, const int num_experts, const int k, const int start_expert, const int end_expert)
|
||||||
|
{
|
||||||
|
|
||||||
|
using cub_kvp = cub::KeyValuePair<int, float>;
|
||||||
|
using BlockReduce = cub::BlockReduce<cub_kvp, TPB>;
|
||||||
|
__shared__ typename BlockReduce::TempStorage tmpStorage;
|
||||||
|
|
||||||
|
cub_kvp thread_kvp;
|
||||||
|
cub::ArgMax arg_max;
|
||||||
|
|
||||||
|
const int num_rows = gridDim.x;
|
||||||
|
const int block_row = blockIdx.x;
|
||||||
|
|
||||||
|
const bool row_is_active = finished ? !finished[block_row] : true;
|
||||||
|
const int thread_read_offset = blockIdx.x * num_experts;
|
||||||
|
for (int k_idx = 0; k_idx < k; ++k_idx)
|
||||||
|
{
|
||||||
|
thread_kvp.key = 0;
|
||||||
|
thread_kvp.value = -1.f; // This is OK because inputs are probabilities
|
||||||
|
|
||||||
|
cub_kvp inp_kvp;
|
||||||
|
for (int expert = threadIdx.x; expert < num_experts; expert += TPB)
|
||||||
|
{
|
||||||
|
const int idx = thread_read_offset + expert;
|
||||||
|
inp_kvp.key = expert;
|
||||||
|
inp_kvp.value = inputs_after_softmax[idx];
|
||||||
|
|
||||||
|
for (int prior_k = 0; prior_k < k_idx; ++prior_k)
|
||||||
|
{
|
||||||
|
const int prior_winning_expert = indices[k * block_row + prior_k];
|
||||||
|
|
||||||
|
if (prior_winning_expert == expert)
|
||||||
|
{
|
||||||
|
inp_kvp = thread_kvp;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
thread_kvp = arg_max(inp_kvp, thread_kvp);
|
||||||
|
}
|
||||||
|
|
||||||
|
const cub_kvp result_kvp = BlockReduce(tmpStorage).Reduce(thread_kvp, arg_max);
|
||||||
|
if (threadIdx.x == 0)
|
||||||
|
{
|
||||||
|
// Ignore experts the node isn't responsible for with expert parallelism
|
||||||
|
const int expert = result_kvp.key;
|
||||||
|
const bool node_uses_expert = expert >= start_expert && expert < end_expert;
|
||||||
|
const bool should_process_row = row_is_active && node_uses_expert;
|
||||||
|
|
||||||
|
const int idx = k * block_row + k_idx;
|
||||||
|
output[idx] = result_kvp.value;
|
||||||
|
indices[idx] = should_process_row ? (expert - start_expert) : num_experts;
|
||||||
|
assert(indices[idx] >= 0);
|
||||||
|
source_rows[idx] = k_idx * num_rows + block_row;
|
||||||
|
}
|
||||||
|
__syncthreads();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ====================== TopK softmax things ===============================
|
||||||
|
|
||||||
|
/*
|
||||||
|
A Top-K gating softmax written to exploit when the number of experts in the MoE layers
|
||||||
|
are a small power of 2. This allows us to cleanly share the rows among the threads in
|
||||||
|
a single warp and eliminate communication between warps (so no need to use shared mem).
|
||||||
|
|
||||||
|
It fuses the softmax, max and argmax into a single kernel.
|
||||||
|
|
||||||
|
Limitations:
|
||||||
|
1) This implementation is intended for when the number of experts is a small power of 2.
|
||||||
|
2) This implementation assumes k is small, but will work for any k.
|
||||||
|
*/
|
||||||
|
|
||||||
|
template <int VPT, int NUM_EXPERTS, int WARPS_PER_CTA, int BYTES_PER_LDG>
|
||||||
|
__launch_bounds__(WARPS_PER_CTA* WARP_SIZE) __global__
|
||||||
|
void topkGatingSoftmax(const float* input, const bool* finished, float* output, const int num_rows, int* indices,
|
||||||
|
int* source_rows, const int k, const int start_expert, const int end_expert)
|
||||||
|
{
|
||||||
|
// We begin by enforcing compile time assertions and setting up compile time constants.
|
||||||
|
static_assert(VPT == (VPT & -VPT), "VPT must be power of 2");
|
||||||
|
static_assert(NUM_EXPERTS == (NUM_EXPERTS & -NUM_EXPERTS), "NUM_EXPERTS must be power of 2");
|
||||||
|
static_assert(BYTES_PER_LDG == (BYTES_PER_LDG & -BYTES_PER_LDG), "BYTES_PER_LDG must be power of 2");
|
||||||
|
static_assert(BYTES_PER_LDG <= 16, "BYTES_PER_LDG must be leq 16");
|
||||||
|
|
||||||
|
// Number of bytes each thread pulls in per load
|
||||||
|
static constexpr int ELTS_PER_LDG = BYTES_PER_LDG / sizeof(float);
|
||||||
|
static constexpr int ELTS_PER_ROW = NUM_EXPERTS;
|
||||||
|
static constexpr int THREADS_PER_ROW = ELTS_PER_ROW / VPT;
|
||||||
|
static constexpr int LDG_PER_THREAD = VPT / ELTS_PER_LDG;
|
||||||
|
|
||||||
|
// Restrictions based on previous section.
|
||||||
|
static_assert(VPT % ELTS_PER_LDG == 0, "The elements per thread must be a multiple of the elements per ldg");
|
||||||
|
static_assert(WARP_SIZE % THREADS_PER_ROW == 0, "The threads per row must cleanly divide the threads per warp");
|
||||||
|
static_assert(THREADS_PER_ROW == (THREADS_PER_ROW & -THREADS_PER_ROW), "THREADS_PER_ROW must be power of 2");
|
||||||
|
static_assert(THREADS_PER_ROW <= WARP_SIZE, "THREADS_PER_ROW can be at most warp size");
|
||||||
|
|
||||||
|
// We have NUM_EXPERTS elements per row. We specialize for small #experts
|
||||||
|
static constexpr int ELTS_PER_WARP = WARP_SIZE * VPT;
|
||||||
|
static constexpr int ROWS_PER_WARP = ELTS_PER_WARP / ELTS_PER_ROW;
|
||||||
|
static constexpr int ROWS_PER_CTA = WARPS_PER_CTA * ROWS_PER_WARP;
|
||||||
|
|
||||||
|
// Restrictions for previous section.
|
||||||
|
static_assert(ELTS_PER_WARP % ELTS_PER_ROW == 0, "The elts per row must cleanly divide the total elt per warp");
|
||||||
|
|
||||||
|
// ===================== From this point, we finally start computing run-time variables. ========================
|
||||||
|
|
||||||
|
// Compute CTA and warp rows. We pack multiple rows into a single warp, and a block contains WARPS_PER_CTA warps.
|
||||||
|
// This, each block processes a chunk of rows. We start by computing the start row for each block.
|
||||||
|
const int cta_base_row = blockIdx.x * ROWS_PER_CTA;
|
||||||
|
|
||||||
|
// Now, using the base row per thread block, we compute the base row per warp.
|
||||||
|
const int warp_base_row = cta_base_row + threadIdx.y * ROWS_PER_WARP;
|
||||||
|
|
||||||
|
// The threads in a warp are split into sub-groups that will work on a row.
|
||||||
|
// We compute row offset for each thread sub-group
|
||||||
|
const int thread_row_in_warp = threadIdx.x / THREADS_PER_ROW;
|
||||||
|
const int thread_row = warp_base_row + thread_row_in_warp;
|
||||||
|
|
||||||
|
// Threads with indices out of bounds should early exit here.
|
||||||
|
if (thread_row >= num_rows)
|
||||||
|
{
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
const bool row_is_active = finished ? !finished[thread_row] : true;
|
||||||
|
|
||||||
|
// We finally start setting up the read pointers for each thread. First, each thread jumps to the start of the
|
||||||
|
// row it will read.
|
||||||
|
const float* thread_row_ptr = input + thread_row * ELTS_PER_ROW;
|
||||||
|
|
||||||
|
// Now, we compute the group each thread belong to in order to determine the first column to start loads.
|
||||||
|
const int thread_group_idx = threadIdx.x % THREADS_PER_ROW;
|
||||||
|
const int first_elt_read_by_thread = thread_group_idx * ELTS_PER_LDG;
|
||||||
|
const float* thread_read_ptr = thread_row_ptr + first_elt_read_by_thread;
|
||||||
|
|
||||||
|
// Determine the pointer type to use to read in the data depending on the BYTES_PER_LDG template param. In theory,
|
||||||
|
// this can support all powers of 2 up to 16.
|
||||||
|
// NOTE(woosuk): The original implementation uses CUTLASS aligned array here.
|
||||||
|
// We defined our own aligned array and use it here to avoid the dependency on CUTLASS.
|
||||||
|
using AccessType = AlignedArray<float, ELTS_PER_LDG>;
|
||||||
|
|
||||||
|
// Finally, we pull in the data from global mem
|
||||||
|
float row_chunk[VPT];
|
||||||
|
AccessType* row_chunk_vec_ptr = reinterpret_cast<AccessType*>(&row_chunk);
|
||||||
|
const AccessType* vec_thread_read_ptr = reinterpret_cast<const AccessType*>(thread_read_ptr);
|
||||||
|
#pragma unroll
|
||||||
|
for (int ii = 0; ii < LDG_PER_THREAD; ++ii)
|
||||||
|
{
|
||||||
|
row_chunk_vec_ptr[ii] = vec_thread_read_ptr[ii * THREADS_PER_ROW];
|
||||||
|
}
|
||||||
|
|
||||||
|
// First, we perform a max reduce within the thread. We can do the max in fp16 safely (I think) and just
|
||||||
|
// convert to float afterwards for the exp + sum reduction.
|
||||||
|
float thread_max = row_chunk[0];
|
||||||
|
#pragma unroll
|
||||||
|
for (int ii = 1; ii < VPT; ++ii)
|
||||||
|
{
|
||||||
|
thread_max = max(thread_max, row_chunk[ii]);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Now, we find the max within the thread group and distribute among the threads. We use a butterfly reduce.
|
||||||
|
#pragma unroll
|
||||||
|
for (int mask = THREADS_PER_ROW / 2; mask > 0; mask /= 2)
|
||||||
|
{
|
||||||
|
thread_max = max(thread_max, VLLM_SHFL_XOR_SYNC_WIDTH(thread_max, mask, THREADS_PER_ROW));
|
||||||
|
}
|
||||||
|
|
||||||
|
// From this point, thread max in all the threads have the max within the row.
|
||||||
|
// Now, we subtract the max from each element in the thread and take the exp. We also compute the thread local sum.
|
||||||
|
float row_sum = 0;
|
||||||
|
#pragma unroll
|
||||||
|
for (int ii = 0; ii < VPT; ++ii)
|
||||||
|
{
|
||||||
|
row_chunk[ii] = expf(row_chunk[ii] - thread_max);
|
||||||
|
row_sum += row_chunk[ii];
|
||||||
|
}
|
||||||
|
|
||||||
|
// Now, we perform the sum reduce within each thread group. Similar to the max reduce, we use a bufferfly pattern.
|
||||||
|
#pragma unroll
|
||||||
|
for (int mask = THREADS_PER_ROW / 2; mask > 0; mask /= 2)
|
||||||
|
{
|
||||||
|
row_sum += VLLM_SHFL_XOR_SYNC_WIDTH(row_sum, mask, THREADS_PER_ROW);
|
||||||
|
}
|
||||||
|
|
||||||
|
// From this point, all threads have the max and the sum for their rows in the thread_max and thread_sum variables
|
||||||
|
// respectively. Finally, we can scale the rows for the softmax. Technically, for top-k gating we don't need to
|
||||||
|
// compute the entire softmax row. We can likely look at the maxes and only compute for the top-k values in the row.
|
||||||
|
// However, this kernel will likely not be a bottle neck and it seems better to closer match torch and find the
|
||||||
|
// argmax after computing the softmax.
|
||||||
|
const float reciprocal_row_sum = 1.f / row_sum;
|
||||||
|
|
||||||
|
#pragma unroll
|
||||||
|
for (int ii = 0; ii < VPT; ++ii)
|
||||||
|
{
|
||||||
|
row_chunk[ii] = row_chunk[ii] * reciprocal_row_sum;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Now, softmax_res contains the softmax of the row chunk. Now, I want to find the topk elements in each row, along
|
||||||
|
// with the max index.
|
||||||
|
int start_col = first_elt_read_by_thread;
|
||||||
|
static constexpr int COLS_PER_GROUP_LDG = ELTS_PER_LDG * THREADS_PER_ROW;
|
||||||
|
|
||||||
|
for (int k_idx = 0; k_idx < k; ++k_idx)
|
||||||
|
{
|
||||||
|
// First, each thread does the local argmax
|
||||||
|
float max_val = row_chunk[0];
|
||||||
|
int expert = start_col;
|
||||||
|
#pragma unroll
|
||||||
|
for (int ldg = 0, col = start_col; ldg < LDG_PER_THREAD; ++ldg, col += COLS_PER_GROUP_LDG)
|
||||||
|
{
|
||||||
|
#pragma unroll
|
||||||
|
for (int ii = 0; ii < ELTS_PER_LDG; ++ii)
|
||||||
|
{
|
||||||
|
float val = row_chunk[ldg * ELTS_PER_LDG + ii];
|
||||||
|
|
||||||
|
// No check on the experts here since columns with the smallest index are processed first and only
|
||||||
|
// updated if > (not >=)
|
||||||
|
if (val > max_val)
|
||||||
|
{
|
||||||
|
max_val = val;
|
||||||
|
expert = col + ii;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Now, we perform the argmax reduce. We use the butterfly pattern so threads reach consensus about the max.
|
||||||
|
// This will be useful for K > 1 so that the threads can agree on "who" had the max value. That thread can
|
||||||
|
// then blank out their max with -inf and the warp can run more iterations...
|
||||||
|
#pragma unroll
|
||||||
|
for (int mask = THREADS_PER_ROW / 2; mask > 0; mask /= 2)
|
||||||
|
{
|
||||||
|
float other_max = VLLM_SHFL_XOR_SYNC_WIDTH(max_val, mask, THREADS_PER_ROW);
|
||||||
|
int other_expert = VLLM_SHFL_XOR_SYNC_WIDTH(expert, mask, THREADS_PER_ROW);
|
||||||
|
|
||||||
|
// We want lower indices to "win" in every thread so we break ties this way
|
||||||
|
if (other_max > max_val || (other_max == max_val && other_expert < expert))
|
||||||
|
{
|
||||||
|
max_val = other_max;
|
||||||
|
expert = other_expert;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Write the max for this k iteration to global memory.
|
||||||
|
if (thread_group_idx == 0)
|
||||||
|
{
|
||||||
|
// Add a guard to ignore experts not included by this node
|
||||||
|
const bool node_uses_expert = expert >= start_expert && expert < end_expert;
|
||||||
|
const bool should_process_row = row_is_active && node_uses_expert;
|
||||||
|
|
||||||
|
// The lead thread from each sub-group will write out the final results to global memory. (This will be a
|
||||||
|
// single) thread per row of the input/output matrices.
|
||||||
|
const int idx = k * thread_row + k_idx;
|
||||||
|
output[idx] = max_val;
|
||||||
|
indices[idx] = should_process_row ? (expert - start_expert) : NUM_EXPERTS;
|
||||||
|
source_rows[idx] = k_idx * num_rows + thread_row;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Finally, we clear the value in the thread with the current max if there is another iteration to run.
|
||||||
|
if (k_idx + 1 < k)
|
||||||
|
{
|
||||||
|
const int ldg_group_for_expert = expert / COLS_PER_GROUP_LDG;
|
||||||
|
const int thread_to_clear_in_group = (expert / ELTS_PER_LDG) % THREADS_PER_ROW;
|
||||||
|
|
||||||
|
// Only the thread in the group which produced the max will reset the "winning" value to -inf.
|
||||||
|
if (thread_group_idx == thread_to_clear_in_group)
|
||||||
|
{
|
||||||
|
const int offset_for_expert = expert % ELTS_PER_LDG;
|
||||||
|
// Safe to set to any negative value since row_chunk values must be between 0 and 1.
|
||||||
|
row_chunk[ldg_group_for_expert * ELTS_PER_LDG + offset_for_expert] = -10000.f;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
namespace detail
|
||||||
|
{
|
||||||
|
// Constructs some constants needed to partition the work across threads at compile time.
|
||||||
|
template <int EXPERTS, int BYTES_PER_LDG>
|
||||||
|
struct TopkConstants
|
||||||
|
{
|
||||||
|
static constexpr int ELTS_PER_LDG = BYTES_PER_LDG / sizeof(float);
|
||||||
|
static_assert(EXPERTS / (ELTS_PER_LDG * WARP_SIZE) == 0 || EXPERTS % (ELTS_PER_LDG * WARP_SIZE) == 0, "");
|
||||||
|
static constexpr int VECs_PER_THREAD = MAX(1, EXPERTS / (ELTS_PER_LDG * WARP_SIZE));
|
||||||
|
static constexpr int VPT = VECs_PER_THREAD * ELTS_PER_LDG;
|
||||||
|
static constexpr int THREADS_PER_ROW = EXPERTS / VPT;
|
||||||
|
static constexpr int ROWS_PER_WARP = WARP_SIZE / THREADS_PER_ROW;
|
||||||
|
};
|
||||||
|
} // namespace detail
|
||||||
|
|
||||||
|
template <int EXPERTS, int WARPS_PER_TB>
|
||||||
|
void topkGatingSoftmaxLauncherHelper(const float* input, const bool* finished, float* output, int* indices,
|
||||||
|
int* source_row, const int num_rows, const int k, const int start_expert, const int end_expert, cudaStream_t stream)
|
||||||
|
{
|
||||||
|
static constexpr std::size_t MAX_BYTES_PER_LDG = 16;
|
||||||
|
|
||||||
|
static constexpr int BYTES_PER_LDG = MIN(MAX_BYTES_PER_LDG, sizeof(float) * EXPERTS);
|
||||||
|
using Constants = detail::TopkConstants<EXPERTS, BYTES_PER_LDG>;
|
||||||
|
static constexpr int VPT = Constants::VPT;
|
||||||
|
static constexpr int ROWS_PER_WARP = Constants::ROWS_PER_WARP;
|
||||||
|
const int num_warps = (num_rows + ROWS_PER_WARP - 1) / ROWS_PER_WARP;
|
||||||
|
const int num_blocks = (num_warps + WARPS_PER_TB - 1) / WARPS_PER_TB;
|
||||||
|
|
||||||
|
dim3 block_dim(WARP_SIZE, WARPS_PER_TB);
|
||||||
|
topkGatingSoftmax<VPT, EXPERTS, WARPS_PER_TB, BYTES_PER_LDG><<<num_blocks, block_dim, 0, stream>>>(
|
||||||
|
input, finished, output, num_rows, indices, source_row, k, start_expert, end_expert);
|
||||||
|
}
|
||||||
|
|
||||||
|
#define LAUNCH_SOFTMAX(NUM_EXPERTS, WARPS_PER_TB) \
|
||||||
|
topkGatingSoftmaxLauncherHelper<NUM_EXPERTS, WARPS_PER_TB>( \
|
||||||
|
gating_output, nullptr, topk_weights, topk_indicies, \
|
||||||
|
token_expert_indices, num_tokens, topk, 0, num_experts, \
|
||||||
|
stream);
|
||||||
|
|
||||||
|
void topkGatingSoftmaxKernelLauncher(
|
||||||
|
const float* gating_output,
|
||||||
|
float* topk_weights,
|
||||||
|
int* topk_indicies,
|
||||||
|
int* token_expert_indices,
|
||||||
|
float* softmax_workspace,
|
||||||
|
const int num_tokens,
|
||||||
|
const int num_experts,
|
||||||
|
const int topk,
|
||||||
|
cudaStream_t stream) {
|
||||||
|
static constexpr int WARPS_PER_TB = 4;
|
||||||
|
switch (num_experts) {
|
||||||
|
case 1:
|
||||||
|
LAUNCH_SOFTMAX(1, WARPS_PER_TB);
|
||||||
|
break;
|
||||||
|
case 2:
|
||||||
|
LAUNCH_SOFTMAX(2, WARPS_PER_TB);
|
||||||
|
break;
|
||||||
|
case 4:
|
||||||
|
LAUNCH_SOFTMAX(4, WARPS_PER_TB);
|
||||||
|
break;
|
||||||
|
case 8:
|
||||||
|
LAUNCH_SOFTMAX(8, WARPS_PER_TB);
|
||||||
|
break;
|
||||||
|
case 16:
|
||||||
|
LAUNCH_SOFTMAX(16, WARPS_PER_TB);
|
||||||
|
break;
|
||||||
|
case 32:
|
||||||
|
LAUNCH_SOFTMAX(32, WARPS_PER_TB);
|
||||||
|
break;
|
||||||
|
case 64:
|
||||||
|
LAUNCH_SOFTMAX(64, WARPS_PER_TB);
|
||||||
|
break;
|
||||||
|
case 128:
|
||||||
|
LAUNCH_SOFTMAX(128, WARPS_PER_TB);
|
||||||
|
break;
|
||||||
|
case 256:
|
||||||
|
LAUNCH_SOFTMAX(256, WARPS_PER_TB);
|
||||||
|
break;
|
||||||
|
default: {
|
||||||
|
TORCH_CHECK(softmax_workspace != nullptr,
|
||||||
|
"softmax_workspace must be provided for num_experts that are not a power of 2.");
|
||||||
|
static constexpr int TPB = 256;
|
||||||
|
moeSoftmax<TPB><<<num_tokens, TPB, 0, stream>>>(
|
||||||
|
gating_output, nullptr, softmax_workspace, num_experts);
|
||||||
|
moeTopK<TPB><<<num_tokens, TPB, 0, stream>>>(
|
||||||
|
softmax_workspace, nullptr, topk_weights, topk_indicies, token_expert_indices,
|
||||||
|
num_experts, topk, 0, num_experts);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace moe
|
||||||
|
} // namespace vllm
|
||||||
|
|
||||||
|
void topk_softmax(
|
||||||
|
torch::Tensor& topk_weights, // [num_tokens, topk]
|
||||||
|
torch::Tensor& topk_indices, // [num_tokens, topk]
|
||||||
|
torch::Tensor& token_expert_indices, // [num_tokens, topk]
|
||||||
|
torch::Tensor& gating_output) // [num_tokens, num_experts]
|
||||||
|
{
|
||||||
|
const int num_experts = gating_output.size(-1);
|
||||||
|
const int num_tokens = gating_output.numel() / num_experts;
|
||||||
|
const int topk = topk_weights.size(-1);
|
||||||
|
|
||||||
|
const bool is_pow_2 = (num_experts != 0) && ((num_experts & (num_experts - 1)) == 0);
|
||||||
|
const bool needs_workspace = !is_pow_2 || num_experts > 256;
|
||||||
|
const int64_t workspace_size = needs_workspace ? num_tokens * num_experts : 0;
|
||||||
|
|
||||||
|
const at::cuda::OptionalCUDAGuard device_guard(device_of(gating_output));
|
||||||
|
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||||
|
torch::Tensor softmax_workspace = torch::empty({workspace_size}, gating_output.options());
|
||||||
|
vllm::moe::topkGatingSoftmaxKernelLauncher(
|
||||||
|
gating_output.data_ptr<float>(),
|
||||||
|
topk_weights.data_ptr<float>(),
|
||||||
|
topk_indices.data_ptr<int>(),
|
||||||
|
token_expert_indices.data_ptr<int>(),
|
||||||
|
softmax_workspace.data_ptr<float>(),
|
||||||
|
num_tokens,
|
||||||
|
num_experts,
|
||||||
|
topk,
|
||||||
|
stream);
|
||||||
|
}
|
||||||
12
csrc/moe/torch_bindings.cpp
Normal file
12
csrc/moe/torch_bindings.cpp
Normal file
@ -0,0 +1,12 @@
|
|||||||
|
#include "registration.h"
|
||||||
|
#include "moe_ops.h"
|
||||||
|
|
||||||
|
TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
|
||||||
|
// Apply topk softmax to the gating outputs.
|
||||||
|
m.def(
|
||||||
|
"topk_softmax(Tensor! topk_weights, Tensor! topk_indices, Tensor! "
|
||||||
|
"token_expert_indices, Tensor gating_output) -> ()");
|
||||||
|
m.impl("topk_softmax", torch::kCUDA, &topk_softmax);
|
||||||
|
}
|
||||||
|
|
||||||
|
REGISTER_EXTENSION(TORCH_EXTENSION_NAME)
|
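
As a sketch only, the same def/impl pattern could be extended to expose further kernels; for example, lines like the following (added inside the block above) would register the moe_align_block_size entry point declared in csrc/ops.h. The schema string is written by hand here and is an assumption, not necessarily the exact registration vLLM uses.

// Hypothetical additional registration following the same pattern.
// int64_t C++ arguments map to `int` in the schema; mutated tensors use `Tensor!`.
m.def(
    "moe_align_block_size(Tensor topk_ids, int num_experts, int block_size, "
    "Tensor! sorted_token_ids, Tensor! experts_ids, "
    "Tensor! num_tokens_post_pad) -> ()");
m.impl("moe_align_block_size", torch::kCUDA, &moe_align_block_size);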

csrc/moe_align_block_size_kernels.cu (new file, +134 lines)
@@ -0,0 +1,134 @@
#include <torch/all.h>
#include <ATen/cuda/CUDAContext.h>

#include <ATen/ATen.h>
#include <THC/THCAtomics.cuh>

#include "cuda_compat.h"
#include "dispatch_utils.h"

#define CEILDIV(x, y) (((x) + (y) - 1) / (y))

namespace vllm {

namespace {
__device__ __forceinline__ int32_t index(int32_t total_col, int32_t row,
                                         int32_t col) {
  // don't worry about overflow because num_experts is relatively small
  return row * total_col + col;
}
}  // namespace

template <typename scalar_t>
__global__ void moe_align_block_size_kernel(scalar_t* __restrict__ topk_ids,
                                            int32_t* sorted_token_ids,
                                            int32_t* expert_ids,
                                            int32_t* total_tokens_post_pad,
                                            int32_t num_experts,
                                            int32_t block_size, size_t numel) {
  const size_t tokens_per_thread = CEILDIV(numel, blockDim.x);
  const size_t start_idx = threadIdx.x * tokens_per_thread;

  extern __shared__ int32_t shared_mem[];

  int32_t* tokens_cnts =
      shared_mem;  // 2d tensor with shape (num_experts + 1, num_experts)
  int32_t* cumsum =
      shared_mem + (num_experts + 1) *
                       num_experts;  // 1d tensor with shape (num_experts + 1)

  for (int i = 0; i < num_experts; ++i) {
    tokens_cnts[index(num_experts, threadIdx.x + 1, i)] = 0;
  }

  /**
   * In the first step we compute token_cnts[thread_index + 1][expert_index],
   * which counts how many tokens in the token shard of thread_index are
   * assigned to expert expert_index.
   */
  for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) {
    ++tokens_cnts[index(num_experts, threadIdx.x + 1, topk_ids[i])];
  }

  __syncthreads();

  // For each expert we accumulate the token counts from the different threads.
  tokens_cnts[index(num_experts, 0, threadIdx.x)] = 0;
  for (int i = 1; i <= blockDim.x; ++i) {
    tokens_cnts[index(num_experts, i, threadIdx.x)] +=
        tokens_cnts[index(num_experts, i - 1, threadIdx.x)];
  }

  __syncthreads();

  // We accumulate the token counts of all experts in thread 0.
  if (threadIdx.x == 0) {
    cumsum[0] = 0;
    for (int i = 1; i <= num_experts; ++i) {
      cumsum[i] = cumsum[i - 1] +
                  CEILDIV(tokens_cnts[index(num_experts, blockDim.x, i - 1)],
                          block_size) *
                      block_size;
    }
    *total_tokens_post_pad = cumsum[num_experts];
  }

  __syncthreads();

  /**
   * For each expert, each thread processes the tokens of the corresponding
   * blocks and stores the corresponding expert_id for each block.
   */
  for (int i = cumsum[threadIdx.x]; i < cumsum[threadIdx.x + 1];
       i += block_size) {
    expert_ids[i / block_size] = threadIdx.x;
  }

  /**
   * Each thread processes a token shard, calculating the index of each token
   * after sorting by expert number. Given the example topk_ids =
   * [0,1,2,1,2,3,0,3,4] and block_size = 4, then the output would be [0, 6, *,
   * *, 1, 3, *, *, 2, 4, *, *, 5, 7, *, *, 8, *, *, *], where * represents a
   * padding value(preset in python).
   */
  for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) {
    int32_t expert_id = topk_ids[i];
    /** The cumsum[expert_id] stores the starting index of the tokens that the
     * expert with expert_id needs to process, and
     * tokens_cnts[threadIdx.x][expert_id] stores the indices of the tokens
     * processed by the expert with expert_id within the current thread's token
     * shard.
     */
    int32_t rank_post_pad =
        tokens_cnts[index(num_experts, threadIdx.x, expert_id)] +
        cumsum[expert_id];
    sorted_token_ids[rank_post_pad] = i;
    ++tokens_cnts[index(num_experts, threadIdx.x, expert_id)];
  }
}
}  // namespace vllm

void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
                          int64_t block_size, torch::Tensor sorted_token_ids,
                          torch::Tensor experts_ids,
                          torch::Tensor num_tokens_post_pad) {
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  VLLM_DISPATCH_INTEGRAL_TYPES(
      topk_ids.scalar_type(), "moe_align_block_size_kernel", [&] {
        // calc needed amount of shared mem for `tokens_cnts` and `cumsum`
        // tensors
        const int32_t shared_mem =
            ((num_experts + 1) * num_experts + (num_experts + 1)) *
            sizeof(int32_t);

        // set dynamic shared mem
        auto kernel = vllm::moe_align_block_size_kernel<scalar_t>;
        AT_CUDA_CHECK(VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(
            (void*)kernel, shared_mem));
        kernel<<<1, num_experts, shared_mem, stream>>>(
            topk_ids.data_ptr<scalar_t>(), sorted_token_ids.data_ptr<int32_t>(),
            experts_ids.data_ptr<int32_t>(),
            num_tokens_post_pad.data_ptr<int32_t>(), num_experts, block_size,
            topk_ids.numel());
      });
}
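
To make the padding arithmetic concrete, here is a small CPU-only sketch that reproduces the worked example from the kernel comment (topk_ids = [0,1,2,1,2,3,0,3,4], block_size = 4). It assumes num_experts = 5 and uses -1 as a stand-in for the padding value that the real flow presets from Python.

// Minimal CPU sketch of the padding/cumsum logic the kernel performs on-GPU.
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  const std::vector<int32_t> topk_ids = {0, 1, 2, 1, 2, 3, 0, 3, 4};
  const int32_t num_experts = 5;  // assumed for this example
  const int32_t block_size = 4;

  // Count how many tokens each expert receives.
  std::vector<int32_t> counts(num_experts, 0);
  for (int32_t e : topk_ids) ++counts[e];

  // Pad each expert's count up to a multiple of block_size and accumulate,
  // exactly like the cumsum loop executed by thread 0 in the kernel.
  std::vector<int32_t> cumsum(num_experts + 1, 0);
  for (int32_t i = 1; i <= num_experts; ++i) {
    cumsum[i] = cumsum[i - 1] +
                ((counts[i - 1] + block_size - 1) / block_size) * block_size;
  }
  std::printf("total_tokens_post_pad = %d\n", cumsum[num_experts]);  // 20

  // Scatter token indices into their padded, expert-sorted slots.
  std::vector<int32_t> offset(num_experts, 0);
  std::vector<int32_t> sorted_token_ids(cumsum[num_experts], -1);  // -1 = pad
  for (int32_t i = 0; i < (int32_t)topk_ids.size(); ++i) {
    const int32_t e = topk_ids[i];
    sorted_token_ids[cumsum[e] + offset[e]++] = i;
  }
  for (int32_t v : sorted_token_ids) std::printf("%d ", v);
  std::printf("\n");  // 0 6 -1 -1 1 3 -1 -1 2 4 -1 -1 5 7 -1 -1 8 -1 -1 -1
  return 0;
}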

csrc/ops.h (new file, +146 lines)
@@ -0,0 +1,146 @@
#pragma once

#include <torch/library.h>

void paged_attention_v1(
    torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache,
    torch::Tensor& value_cache, int64_t num_kv_heads, double scale,
    torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size,
    int64_t max_seq_len, const c10::optional<torch::Tensor>& alibi_slopes,
    const std::string& kv_cache_dtype, double kv_scale, const int64_t tp_rank,
    const int64_t blocksparse_local_blocks,
    const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
    const int64_t blocksparse_head_sliding_step);

void paged_attention_v2(
    torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits,
    torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache,
    torch::Tensor& value_cache, int64_t num_kv_heads, double scale,
    torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size,
    int64_t max_seq_len, const c10::optional<torch::Tensor>& alibi_slopes,
    const std::string& kv_cache_dtype, double kv_scale, const int64_t tp_rank,
    const int64_t blocksparse_local_blocks,
    const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
    const int64_t blocksparse_head_sliding_step);

void rms_norm(torch::Tensor& out, torch::Tensor& input, torch::Tensor& weight,
              double epsilon);

void fused_add_rms_norm(torch::Tensor& input, torch::Tensor& residual,
                        torch::Tensor& weight, double epsilon);

void rotary_embedding(torch::Tensor& positions, torch::Tensor& query,
                      torch::Tensor& key, int64_t head_size,
                      torch::Tensor& cos_sin_cache, bool is_neox);

void batched_rotary_embedding(torch::Tensor& positions, torch::Tensor& query,
                              torch::Tensor& key, int64_t head_size,
                              torch::Tensor& cos_sin_cache, bool is_neox,
                              int64_t rot_dim,
                              torch::Tensor& cos_sin_cache_offsets);

void silu_and_mul(torch::Tensor& out, torch::Tensor& input);

void gelu_and_mul(torch::Tensor& out, torch::Tensor& input);

void gelu_tanh_and_mul(torch::Tensor& out, torch::Tensor& input);

void gelu_new(torch::Tensor& out, torch::Tensor& input);

void gelu_fast(torch::Tensor& out, torch::Tensor& input);

#ifndef USE_ROCM
torch::Tensor aqlm_gemm(const torch::Tensor& input, const torch::Tensor& codes,
                        const torch::Tensor& codebooks,
                        const torch::Tensor& scales,
                        const torch::Tensor& codebook_partition_sizes,
                        const std::optional<torch::Tensor>& bias);

torch::Tensor aqlm_dequant(const torch::Tensor& codes,
                           const torch::Tensor& codebooks,
                           const torch::Tensor& codebook_partition_sizes);

torch::Tensor awq_gemm(torch::Tensor _in_feats, torch::Tensor _kernel,
                       torch::Tensor _scaling_factors, torch::Tensor _zeros,
                       int64_t split_k_iters);

torch::Tensor awq_dequantize(torch::Tensor _kernel,
                             torch::Tensor _scaling_factors,
                             torch::Tensor _zeros, int64_t split_k_iters,
                             int64_t thx, int64_t thy);

torch::Tensor marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
                          torch::Tensor& b_scales, torch::Tensor& workspace,
                          int64_t size_m, int64_t size_n, int64_t size_k);

torch::Tensor gptq_marlin_24_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
                                  torch::Tensor& b_meta,
                                  torch::Tensor& b_scales,
                                  torch::Tensor& workspace, int64_t num_bits,
                                  int64_t size_m, int64_t size_n,
                                  int64_t size_k);

torch::Tensor gptq_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
                               torch::Tensor& b_scales, torch::Tensor& g_idx,
                               torch::Tensor& perm, torch::Tensor& workspace,
                               int64_t num_bits, int64_t size_m, int64_t size_n,
                               int64_t size_k, bool is_k_full);

torch::Tensor gptq_marlin_repack(torch::Tensor& b_q_weight, torch::Tensor& perm,
                                 int64_t size_k, int64_t size_n,
                                 int64_t num_bits);

void cutlass_scaled_mm_dq(torch::Tensor& out, torch::Tensor const& a,
                          torch::Tensor const& b, torch::Tensor const& a_scales,
                          torch::Tensor const& b_scales);

#endif

void static_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input,
                              torch::Tensor const& scale);

void dynamic_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input,
                               torch::Tensor& scales);

void squeezellm_gemm(torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
                     torch::Tensor lookup_table);

torch::Tensor gptq_gemm(torch::Tensor a, torch::Tensor b_q_weight,
                        torch::Tensor b_gptq_qzeros,
                        torch::Tensor b_gptq_scales, torch::Tensor b_g_idx,
                        bool use_exllama, int64_t bit);

void gptq_shuffle(torch::Tensor q_weight, torch::Tensor q_perm, int64_t bit);

void static_scaled_fp8_quant(torch::Tensor& out, torch::Tensor& input,
                             torch::Tensor& scale);

void dynamic_scaled_fp8_quant(torch::Tensor& out, torch::Tensor& input,
                              torch::Tensor& scale);

void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
                          int64_t block_size, torch::Tensor sorted_token_ids,
                          torch::Tensor experts_ids,
                          torch::Tensor num_tokens_post_pad);

#ifndef USE_ROCM
using fptr_t = int64_t;
fptr_t init_custom_ar(torch::Tensor& meta, torch::Tensor& rank_data,
                      const std::vector<std::string>& handles,
                      const std::vector<int64_t>& offsets, int64_t rank,
                      bool full_nvlink);
bool should_custom_ar(torch::Tensor& inp, int64_t max_size, int64_t world_size,
                      bool full_nvlink);
void all_reduce_reg(fptr_t _fa, torch::Tensor& inp, torch::Tensor& out);
void all_reduce_unreg(fptr_t _fa, torch::Tensor& inp, torch::Tensor& reg_buffer,
                      torch::Tensor& out);
void dispose(fptr_t _fa);
int64_t meta_size();
void register_buffer(fptr_t _fa, torch::Tensor& t,
                     const std::vector<std::string>& handles,
                     const std::vector<int64_t>& offsets);
std::tuple<torch::Tensor, std::vector<int64_t>> get_graph_buffer_ipc_meta(
    fptr_t _fa);
void register_graph_buffers(fptr_t _fa, const std::vector<std::string>& handles,
                            const std::vector<std::vector<int64_t>>& offsets);
#endif

@@ -1,16 +0,0 @@ (entire file removed)
#include <torch/extension.h>

void rotary_embedding(
  torch::Tensor& positions,
  torch::Tensor& query,
  torch::Tensor& key,
  int head_size,
  torch::Tensor& cos_sin_cache,
  bool is_neox);

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def(
    "rotary_embedding",
    &rotary_embedding,
    "Apply GPT-NeoX or GPT-J style rotary embedding to query and key");
}

@@ -1,32 +1,30 @@
#include <torch/all.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>

#include "cuda_compat.h"
#include "dispatch_utils.h"

namespace vllm {

template <typename scalar_t, bool IS_NEOX>
inline __device__ void apply_token_rotary_embedding(
    scalar_t* __restrict__ arr, const scalar_t* __restrict__ cos_ptr,
    const scalar_t* __restrict__ sin_ptr, int rot_offset, int embed_dim) {
  int x_index, y_index;
  scalar_t cos, sin;
  if (IS_NEOX) {
    // GPT-NeoX style rotary embedding.
    x_index = rot_offset;
    y_index = embed_dim + rot_offset;
    cos = VLLM_LDG(cos_ptr + x_index);
    sin = VLLM_LDG(sin_ptr + x_index);
  } else {
    // GPT-J style rotary embedding.
    x_index = 2 * rot_offset;
    y_index = 2 * rot_offset + 1;
    cos = VLLM_LDG(cos_ptr + x_index / 2);
    sin = VLLM_LDG(sin_ptr + x_index / 2);
  }

  const scalar_t x = arr[x_index];

@@ -35,23 +33,17 @@ inline __device__ void apply_rotary_embedding(
  arr[y_index] = y * cos + x * sin;
}

template <typename scalar_t, bool IS_NEOX>
inline __device__ void apply_rotary_embedding(
    scalar_t* __restrict__ query,  // [batch_size, seq_len, num_heads,
                                   // head_size] or [num_tokens, num_heads,
                                   // head_size]
    scalar_t* __restrict__ key,  // [batch_size, seq_len, num_kv_heads,
                                 // head_size] or [num_tokens, num_kv_heads,
                                 // head_size]
    const scalar_t* cache_ptr, const int head_size, const int num_heads,
    const int num_kv_heads, const int rot_dim, const int token_idx,
    const int64_t query_stride, const int64_t key_stride) {
  const int embed_dim = rot_dim / 2;
  const scalar_t* cos_ptr = cache_ptr;
  const scalar_t* sin_ptr = cache_ptr + embed_dim;

@@ -59,69 +51,153 @@ __global__ void rotary_embedding_kernel(
  const int nq = num_heads * embed_dim;
  for (int i = threadIdx.x; i < nq; i += blockDim.x) {
    const int head_idx = i / embed_dim;
    const int64_t token_head = token_idx * query_stride + head_idx * head_size;
    const int rot_offset = i % embed_dim;
    apply_token_rotary_embedding<scalar_t, IS_NEOX>(
        query + token_head, cos_ptr, sin_ptr, rot_offset, embed_dim);
  }

  const int nk = num_kv_heads * embed_dim;
  for (int i = threadIdx.x; i < nk; i += blockDim.x) {
    const int head_idx = i / embed_dim;
    const int64_t token_head = token_idx * key_stride + head_idx * head_size;
    const int rot_offset = i % embed_dim;
    apply_token_rotary_embedding<scalar_t, IS_NEOX>(
        key + token_head, cos_ptr, sin_ptr, rot_offset, embed_dim);
  }
}

template <typename scalar_t, bool IS_NEOX>
__global__ void rotary_embedding_kernel(
    const int64_t* __restrict__ positions,  // [batch_size, seq_len] or
                                            // [num_tokens]
    scalar_t* __restrict__ query,  // [batch_size, seq_len, num_heads,
                                   // head_size] or [num_tokens, num_heads,
                                   // head_size]
    scalar_t* __restrict__ key,  // [batch_size, seq_len, num_kv_heads,
                                 // head_size] or [num_tokens, num_kv_heads,
                                 // head_size]
    const scalar_t* __restrict__ cos_sin_cache,  // [max_position, 2, rot_dim //
                                                 // 2]
    const int rot_dim, const int64_t query_stride, const int64_t key_stride,
    const int num_heads, const int num_kv_heads, const int head_size) {
  // Each thread block is responsible for one token.
  const int token_idx = blockIdx.x;
  int64_t pos = positions[token_idx];
  const scalar_t* cache_ptr = cos_sin_cache + pos * rot_dim;

  apply_rotary_embedding<scalar_t, IS_NEOX>(
      query, key, cache_ptr, head_size, num_heads, num_kv_heads, rot_dim,
      token_idx, query_stride, key_stride);
}

template <typename scalar_t, bool IS_NEOX>
__global__ void batched_rotary_embedding_kernel(
    const int64_t* __restrict__ positions,  // [batch_size, seq_len] or
                                            // [num_tokens]
    scalar_t* __restrict__ query,  // [batch_size, seq_len, num_heads,
                                   // head_size] or [num_tokens, num_heads,
                                   // head_size]
    scalar_t* __restrict__ key,  // [batch_size, seq_len, num_kv_heads,
                                 // head_size] or [num_tokens, num_kv_heads,
                                 // head_size]
    const scalar_t* __restrict__ cos_sin_cache,  // [max_position, 2, rot_dim //
                                                 // 2]
    const int64_t* __restrict__ cos_sin_cache_offsets,  // [batch_size, seq_len]
                                                        // or [num_tokens]
    const int rot_dim, const int64_t query_stride, const int64_t key_stride,
    const int num_heads, const int num_kv_heads, const int head_size) {
  // Each thread block is responsible for one token.
  const int token_idx = blockIdx.x;
  int64_t pos = positions[token_idx];
  int64_t cos_sin_cache_offset = cos_sin_cache_offsets[token_idx];
  const scalar_t* cache_ptr =
      cos_sin_cache + (cos_sin_cache_offset + pos) * rot_dim;

  apply_rotary_embedding<scalar_t, IS_NEOX>(
      query, key, cache_ptr, head_size, num_heads, num_kv_heads, rot_dim,
      token_idx, query_stride, key_stride);
}

}  // namespace vllm

void rotary_embedding(
    torch::Tensor& positions,  // [batch_size, seq_len] or [num_tokens]
    torch::Tensor& query,  // [batch_size, seq_len, num_heads * head_size] or
                           // [num_tokens, num_heads * head_size]
    torch::Tensor& key,    // [batch_size, seq_len, num_kv_heads * head_size] or
                           // [num_tokens, num_kv_heads * head_size]
    int64_t head_size,
    torch::Tensor& cos_sin_cache,  // [max_position, rot_dim]
    bool is_neox) {
  int64_t num_tokens = query.numel() / query.size(-1);
  int rot_dim = cos_sin_cache.size(1);
  int num_heads = query.size(-1) / head_size;
  int num_kv_heads = key.size(-1) / head_size;
  int64_t query_stride = query.stride(-2);
  int64_t key_stride = key.stride(-2);

  dim3 grid(num_tokens);
  dim3 block(std::min<int64_t>(num_heads * rot_dim / 2, 512));
  const at::cuda::OptionalCUDAGuard device_guard(device_of(query));
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  VLLM_DISPATCH_FLOATING_TYPES(query.scalar_type(), "rotary_embedding", [&] {
    if (is_neox) {
      vllm::rotary_embedding_kernel<scalar_t, true><<<grid, block, 0, stream>>>(
          positions.data_ptr<int64_t>(), query.data_ptr<scalar_t>(),
          key.data_ptr<scalar_t>(), cos_sin_cache.data_ptr<scalar_t>(), rot_dim,
          query_stride, key_stride, num_heads, num_kv_heads, head_size);
    } else {
      vllm::rotary_embedding_kernel<scalar_t, false>
          <<<grid, block, 0, stream>>>(
              positions.data_ptr<int64_t>(), query.data_ptr<scalar_t>(),
              key.data_ptr<scalar_t>(), cos_sin_cache.data_ptr<scalar_t>(),
              rot_dim, query_stride, key_stride, num_heads, num_kv_heads,
              head_size);
    }
  });
}

/*
Batched version of rotary embedding, pack multiple LoRAs together
and process in batched manner.
*/
void batched_rotary_embedding(
    torch::Tensor& positions,  // [batch_size, seq_len] or [num_tokens]
    torch::Tensor& query,  // [batch_size, seq_len, num_heads * head_size] or
                           // [num_tokens, num_heads * head_size]
    torch::Tensor& key,    // [batch_size, seq_len, num_kv_heads * head_size] or
                           // [num_tokens, num_kv_heads * head_size]
    int64_t head_size,
    torch::Tensor& cos_sin_cache,  // [max_position, rot_dim]
    bool is_neox, int64_t rot_dim,
    torch::Tensor& cos_sin_cache_offsets  // [num_tokens]
) {
  int64_t num_tokens = cos_sin_cache_offsets.size(0);
  int num_heads = query.size(-1) / head_size;
  int num_kv_heads = key.size(-1) / head_size;
  int64_t query_stride = query.stride(-2);
  int64_t key_stride = key.stride(-2);

  dim3 grid(num_tokens);
  dim3 block(std::min<int64_t>(num_heads * rot_dim / 2, 512));
  const at::cuda::OptionalCUDAGuard device_guard(device_of(query));
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  VLLM_DISPATCH_FLOATING_TYPES(query.scalar_type(), "rotary_embedding", [&] {
    if (is_neox) {
      vllm::batched_rotary_embedding_kernel<scalar_t, true>
          <<<grid, block, 0, stream>>>(
              positions.data_ptr<int64_t>(), query.data_ptr<scalar_t>(),
              key.data_ptr<scalar_t>(), cos_sin_cache.data_ptr<scalar_t>(),
              cos_sin_cache_offsets.data_ptr<int64_t>(), rot_dim, query_stride,
              key_stride, num_heads, num_kv_heads, head_size);
    } else {
      vllm::batched_rotary_embedding_kernel<scalar_t, false>
          <<<grid, block, 0, stream>>>(
              positions.data_ptr<int64_t>(), query.data_ptr<scalar_t>(),
              key.data_ptr<scalar_t>(), cos_sin_cache.data_ptr<scalar_t>(),
              cos_sin_cache_offsets.data_ptr<int64_t>(), rot_dim, query_stride,
              key_stride, num_heads, num_kv_heads, head_size);
    }
  });
}
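
Finally, a minimal host-side sketch of driving the non-batched rotary_embedding entry point above, assuming its declaration (csrc/ops.h) is visible. The model dimensions are made up; only the tensor shapes, dtypes and the in-place update matter here.

// Hypothetical example sizes; values are random and purely illustrative.
#include <torch/torch.h>

void run_rotary_embedding_example() {
  const int64_t num_tokens = 8, num_heads = 32, num_kv_heads = 8;
  const int64_t head_size = 128, rot_dim = 128, max_position = 4096;
  const auto opts_f =
      torch::TensorOptions().dtype(torch::kFloat32).device(torch::kCUDA);
  const auto opts_i64 =
      torch::TensorOptions().dtype(torch::kInt64).device(torch::kCUDA);

  torch::Tensor positions = torch::arange(num_tokens, opts_i64);
  torch::Tensor query = torch::randn({num_tokens, num_heads * head_size}, opts_f);
  torch::Tensor key = torch::randn({num_tokens, num_kv_heads * head_size}, opts_f);
  torch::Tensor cos_sin_cache = torch::randn({max_position, rot_dim}, opts_f);

  // Rotates query and key in place: one thread block per token, with
  // min(num_heads * rot_dim / 2, 512) threads, as set up in the launcher above.
  rotary_embedding(positions, query, key, head_size, cos_sin_cache,
                   /*is_neox=*/true);
}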
Some files were not shown because too many files have changed in this diff.