Mirror of https://github.com/vllm-project/vllm.git (synced 2025-10-20 23:03:52 +08:00)

Compare commits: 577 commits, v0.9.0.1...gemma3n-mm
| SHA1 | Author | Date | |
|---|---|---|---|
| b801bf30d7 | |||
| bfd63b1b10 | |||
| 3c545c0c3b | |||
| e8c3bd2cd1 | |||
| c6c983053d | |||
| aafabaa0d5 | |||
| 94a55c7681 | |||
| aa0dc77ef5 | |||
| 4ab3ac285e | |||
| d1c956dc0f | |||
| dec197e3e5 | |||
| 6e244ae091 | |||
| cd4cfee689 | |||
| e110930680 | |||
| 8b64c895c0 | |||
| 0740e29b66 | |||
| 44d2e6af63 | |||
| 2d7779f888 | |||
| a57d57fa72 | |||
| 71799fd005 | |||
| e9fd658a73 | |||
| 07b8fae219 | |||
| 562308816c | |||
| 04e1642e32 | |||
| b69781f107 | |||
| 0bceac9810 | |||
| 34878a0b48 | |||
| 6393b03986 | |||
| 0907d507bf | |||
| c894c5dc1f | |||
| 1f5d178e9c | |||
| 27c065df50 | |||
| 84c260caeb | |||
| 167aca45cb | |||
| 0567c8249f | |||
| d188913d99 | |||
| 1d7c29f5fe | |||
| 65397e40f5 | |||
| 9502c38138 | |||
| 2582683566 | |||
| 754b00edb3 | |||
| 296ce95d8e | |||
| 2d7620c3eb | |||
| 55c65ab495 | |||
| 2cc2069970 | |||
| 9f0608fc16 | |||
| 4e0db57fff | |||
| c40692bf9a | |||
| 4734704b30 | |||
| 8b8c209e35 | |||
| 23a04e0895 | |||
| 02c97d9a92 | |||
| e795d723ed | |||
| 8359f4c8d8 | |||
| bf5181583f | |||
| c53fec1fcb | |||
| 0f9e7354f5 | |||
| ba7ba35cda | |||
| 015fab8c2f | |||
| f59fc60fb3 | |||
| 879f69bed3 | |||
| 7108934142 | |||
| 3443aaf8dd | |||
| 2273ec322c | |||
| a6c4b87fbc | |||
| 1afa9948f5 | |||
| 0d06b533a0 | |||
| c01d1c5aba | |||
| ead369845d | |||
| c6e3bba8e6 | |||
| 91f7d9d0b6 | |||
| 8619e7158c | |||
| c635c5f744 | |||
| a045b7e89a | |||
| 981eeca41a | |||
| 26d34eb67e | |||
| 53da4cd397 | |||
| 9a3b88328f | |||
| 3014c920da | |||
| 0eed516951 | |||
| ee5ad8d2c5 | |||
| a738dbb2a1 | |||
| 33d5e29be9 | |||
| 4671ac6e2a | |||
| dd2ccf8dde | |||
| a3bc76e4b5 | |||
| e6327c9b3e | |||
| d0132f025d | |||
| 61f4fc5dc6 | |||
| 68aaeb3749 | |||
| c3649e4fee | |||
| 53243e5c42 | |||
| a6e6604d32 | |||
| b82e0f82cb | |||
| 5111642a6f | |||
| 1bcd15edc7 | |||
| 2ebff5b77c | |||
| f17aec0d63 | |||
| 493c275352 | |||
| f39ab2d4bd | |||
| 4a0f7888a3 | |||
| c4cf260677 | |||
| 33d51f599e | |||
| e91386cde1 | |||
| 2c11a29f0b | |||
| c76a506bd6 | |||
| ec0db6f51c | |||
| c305a2109d | |||
| 202c5df935 | |||
| 2bb246b8f7 | |||
| 4c409cabc2 | |||
| 3b1e4c6a23 | |||
| 2c5302fadd | |||
| caa680fd2e | |||
| c3bf9bad11 | |||
| 6f170f11dd | |||
| 8ca81bb069 | |||
| e773a9e1c2 | |||
| 71baf85ae1 | |||
| 79f2f1c2a1 | |||
| 2e3e3c86dc | |||
| 7e8977fcd4 | |||
| f1e840e842 | |||
| 7771d1de88 | |||
| 71d1219545 | |||
| e384f2f108 | |||
| 089a306f19 | |||
| 5e666f72cd | |||
| e3a3e4db46 | |||
| e41bf15cd0 | |||
| 5aa4a015ce | |||
| b6bad3d186 | |||
| ee9a1531aa | |||
| 10d82f9ac5 | |||
| ea10dd9d9e | |||
| ead2110297 | |||
| 01220ce89a | |||
| 6f68c49220 | |||
| 4719460644 | |||
| 466166dcfd | |||
| 1d0ae26c85 | |||
| 6021999573 | |||
| c7b370c603 | |||
| aa20d10a91 | |||
| 2de12be428 | |||
| 83ca9ae47b | |||
| e2148dc5ea | |||
| b1098b4072 | |||
| 799397ee4f | |||
| 4959915089 | |||
| 8d1e89d946 | |||
| 36239f79dd | |||
| dfada85eee | |||
| ed33349738 | |||
| d49adea1f9 | |||
| 14fdd21d39 | |||
| 04fefe7c9a | |||
| 3b523e38d9 | |||
| 16c16301c8 | |||
| 9206d0ff01 | |||
| a89209b78d | |||
| ffacb222cb | |||
| 12575cfa7a | |||
| 8b6e1d639c | |||
| 735a9de71f | |||
| 257ab95439 | |||
| cca91a7a10 | |||
| f04d604567 | |||
| 19a53b2783 | |||
| eccdc8318c | |||
| 5f52a84685 | |||
| d4629dc43f | |||
| 6e9cc73f67 | |||
| c53711bd63 | |||
| dac8cc49f4 | |||
| a44b1c951d | |||
| b447624ee3 | |||
| cda92307c1 | |||
| bf57ccc5c2 | |||
| ffb2cd6b54 | |||
| ca94d7fa00 | |||
| 5a1c2e15d8 | |||
| 4c8f64faa7 | |||
| 93aee29fdb | |||
| 154d063b9f | |||
| ccd7c05089 | |||
| c48c6c4008 | |||
| aed8468642 | |||
| 5c76b9cdaf | |||
| ddfed314f9 | |||
| 5b3ad5ecf2 | |||
| ede5c4ebdf | |||
| 07334959d8 | |||
| 119f683949 | |||
| 0860087aff | |||
| 6bc7b57315 | |||
| 90f9c2eb5c | |||
| 387bdf0ab9 | |||
| 5e5baa91aa | |||
| 836d4ce140 | |||
| c3fec47bb7 | |||
| 1173804dca | |||
| 4d5424029b | |||
| 3e7506975c | |||
| ee35e96ac3 | |||
| dec66d253b | |||
| 8d120701fd | |||
| f40f763f12 | |||
| 26bc46ef89 | |||
| a77aea59fd | |||
| b692e9cd07 | |||
| 367871a469 | |||
| 92183b41f3 | |||
| c6703d1e0d | |||
| a5e7242d5f | |||
| 91b2c17a55 | |||
| 055915e6ce | |||
| 3d330c4c09 | |||
| 0b73736a0d | |||
| ee1531bc38 | |||
| e13945f9dd | |||
| 08500011d3 | |||
| 861a0a0a39 | |||
| bc956b38d0 | |||
| 294fc1e2c9 | |||
| 2db9044ab6 | |||
| 6fa718a460 | |||
| 06be858828 | |||
| d1e34cc9ac | |||
| bd517eb9fe | |||
| d65668b4e8 | |||
| aafbbd981f | |||
| 0f0874515a | |||
| 3597b06a4f | |||
| 1015296b79 | |||
| ce9dc02c93 | |||
| a24cb91600 | |||
| 7e8d97dd3f | |||
| d70bc7c029 | |||
| ce688ad46e | |||
| cefdb9962d | |||
| ace5cdaff0 | |||
| 6458721108 | |||
| bb4a0decef | |||
| c707cfc12e | |||
| 7b3c9ff91d | |||
| c68698b326 | |||
| e3b12667d4 | |||
| e6aab5de29 | |||
| c57bb199b3 | |||
| dba68f9159 | |||
| a3319f4f04 | |||
| 9d880f594d | |||
| 017ef648e9 | |||
| 4b25ab14e2 | |||
| f98548b9da | |||
| 96846bb360 | |||
| b6efafd9e4 | |||
| 1129e2b1ab | |||
| c742438f8b | |||
| 73e2e0118f | |||
| c9280e6346 | |||
| af09b3f0a0 | |||
| 4f6c42fa0a | |||
| dff680001d | |||
| 2e090bd5df | |||
| 1b0b065eb5 | |||
| d5bdf899e4 | |||
| 7e3e74c97c | |||
| 3f6341bf7f | |||
| e5d35d62f5 | |||
| 2f1c19b245 | |||
| 42f52cc95b | |||
| 97a9465bbc | |||
| c7ea0b56cd | |||
| 29fa5cac1c | |||
| b2d9be6f7d | |||
| 04a55612dd | |||
| 89b0f84e17 | |||
| 497a91e9f7 | |||
| 943ffa5703 | |||
| 5c8d34a42c | |||
| 3c8694eabe | |||
| 7484e1fce2 | |||
| a2142f0196 | |||
| 871d6b7c74 | |||
| 29a38f0352 | |||
| a5115f4ff5 | |||
| 68b4a26149 | |||
| b8e809a057 | |||
| 5039ec2336 | |||
| 7c644ab6d5 | |||
| 2d40665fe8 | |||
| 96ada386b7 | |||
| 1e473b3010 | |||
| 2b1e2111b0 | |||
| a45b979d9f | |||
| 3952731e8f | |||
| 77f0d465d0 | |||
| 22c3c0aa4a | |||
| 33f8dba7c6 | |||
| 5241ca50d6 | |||
| da9b523ce1 | |||
| b6553be1bc | |||
| 64a9af5afa | |||
| e4248849ec | |||
| 467bef18a3 | |||
| 5f1ac1e1d1 | |||
| 9368cc90b2 | |||
| 32b3946bb4 | |||
| 6b1391ca7e | |||
| a3f66e75d1 | |||
| 319cb1e351 | |||
| 1efef71645 | |||
| 646d62f636 | |||
| 6cd4ae8acd | |||
| c016047ed7 | |||
| 9af6d22e4c | |||
| 4589b94032 | |||
| cc867be19c | |||
| 3a7cd627a8 | |||
| 8058c91108 | |||
| 7d44c469fe | |||
| 31f58be96a | |||
| ebb2f383b8 | |||
| c1c7dbbeeb | |||
| 5cf2daea9a | |||
| b8089195b4 | |||
| 770e5dcdb8 | |||
| c57c9415b1 | |||
| 01810f9236 | |||
| 59abbd84f9 | |||
| 95a6568b5c | |||
| 0eca5eacd0 | |||
| 12e5829221 | |||
| 3a4d417707 | |||
| 8335667c22 | |||
| e1c4380d4c | |||
| e31ae3de36 | |||
| 2ffb9b6e07 | |||
| cda10fa3e2 | |||
| c123bc33f9 | |||
| b9a1791e2c | |||
| 989dcee981 | |||
| 3d64d366e0 | |||
| eaa2e51088 | |||
| d77f7fb871 | |||
| 2d8476e465 | |||
| 88be823d57 | |||
| 4e4f63ad45 | |||
| d2f0e7e615 | |||
| 122cdca5f6 | |||
| cf02f9b283 | |||
| c4296b1a27 | |||
| 66c508b137 | |||
| 84166fee97 | |||
| 6e0cd10f72 | |||
| e010688f50 | |||
| 441b65d8c7 | |||
| 46ecc57973 | |||
| b6a3a9f76d | |||
| ca27f0f9c1 | |||
| aad30bd306 | |||
| 94ecee6282 | |||
| 8267f9916f | |||
| 7353492a47 | |||
| 7661e92ef8 | |||
| f168b85725 | |||
| da511d54d8 | |||
| 65c69444b1 | |||
| 94870359cd | |||
| 0d49483ea9 | |||
| 90b78ec5f9 | |||
| 91a2ef98ea | |||
| 3da2313d78 | |||
| b61dc5f972 | |||
| f8a1a2d108 | |||
| 3465b87ef8 | |||
| c8134bea15 | |||
| cb6d572e85 | |||
| 87360308b7 | |||
| aa49f14832 | |||
| 9ef9173cfa | |||
| 85e2b7bb13 | |||
| 61059bee40 | |||
| ec89524f50 | |||
| f20f9f063b | |||
| 9bc8bb07cf | |||
| 1aeb925f34 | |||
| 188a4590d8 | |||
| 18093084be | |||
| da40380214 | |||
| 8fc57501d3 | |||
| af7fc84fd2 | |||
| 0678b52251 | |||
| 25b918eee6 | |||
| a408820f2f | |||
| c56ed8bb0e | |||
| 78dcf56cb3 | |||
| b2fac67130 | |||
| 23027e2daf | |||
| c3fd4d669a | |||
| ef3f98b59f | |||
| 7ee2590478 | |||
| 53a5a0ce30 | |||
| d459fae0a2 | |||
| c8dcc15921 | |||
| 8f4ffbd373 | |||
| 5f2cd251d2 | |||
| 02658c2dfe | |||
| 01dc9a76db | |||
| 35cf32df30 | |||
| 8711bc5e68 | |||
| 2669a0d7b5 | |||
| 8e972d9c44 | |||
| 3336c8cfbe | |||
| b124e1085b | |||
| 41aa578428 | |||
| 8d646c2e53 | |||
| 5d6d1adf15 | |||
| 1409ef9134 | |||
| 4555143ea7 | |||
| 52dceb172d | |||
| abd7df2fca | |||
| b712be98c7 | |||
| a8da78eac9 | |||
| 5d96533e22 | |||
| 4de790fcad | |||
| b5fd9506c1 | |||
| 135cf55cd1 | |||
| 6cac54f4d1 | |||
| 6865fe0074 | |||
| e31446b6c8 | |||
| bdf13965ab | |||
| fa98d77773 | |||
| 01eee40536 | |||
| 19bdaf32b1 | |||
| 02f0c7b220 | |||
| d054da1992 | |||
| 4b7817c119 | |||
| d00dd65cd4 | |||
| d81edded69 | |||
| 476844d44c | |||
| 4e68ae5e59 | |||
| 4e88723f32 | |||
| 118ff92111 | |||
| ec2dcd80bc | |||
| 42243fbda0 | |||
| 6d18ed2a2e | |||
| f32fcd9444 | |||
| d32aa2e670 | |||
| cc977286e7 | |||
| 17430e3653 | |||
| 1282bd812e | |||
| bdce64f236 | |||
| 9e6f61e8c3 | |||
| 8655f47f37 | |||
| 4ce42f9204 | |||
| 8a57872b2a | |||
| 5bc1ad6cee | |||
| 9112b443a0 | |||
| c57d577e8d | |||
| ca2f6b9c30 | |||
| 20133cfee2 | |||
| ebb1ec9318 | |||
| 5b168b6d7a | |||
| 9760fd8f6a | |||
| b9f61e1387 | |||
| d6fd3a33b8 | |||
| 432ec9926e | |||
| 2b102d51ad | |||
| aa54a7bf7b | |||
| 2ad6194a02 | |||
| c594cbf565 | |||
| a35ca765a5 | |||
| 6aa8f9a4e7 | |||
| 1bc86a3da1 | |||
| bbfa0c61d1 | |||
| 20079c6e36 | |||
| 9a1b9b99d7 | |||
| 8bf507d766 | |||
| 306d60401d | |||
| f2c3f66d59 | |||
| 0f5e0d567e | |||
| c55d804672 | |||
| 749f5bdd38 | |||
| 2a50ef5760 | |||
| b8b904795d | |||
| ba5111f237 | |||
| 1e123529d7 | |||
| dff80b0e42 | |||
| 7782464a17 | |||
| 0f71e24034 | |||
| 1dab4d5718 | |||
| 7f21e8052b | |||
| 5a8641638a | |||
| f49239cb45 | |||
| 2dbe8c0774 | |||
| 84ec470fca | |||
| b29ca5c4d5 | |||
| ec6833c5e9 | |||
| e1fadf1197 | |||
| 43ff405b90 | |||
| fba02e3bd1 | |||
| 4577fc9abb | |||
| 5f1d0c8118 | |||
| c3bb9f2331 | |||
| 8f8900cee9 | |||
| 6acb7a6285 | |||
| 4f4a6b844a | |||
| 4d0a1541be | |||
| 77b6e74fe2 | |||
| 5acf828d99 | |||
| 3987e2ae96 | |||
| 77164dad5e | |||
| 3de3eadf5b | |||
| 3132290a14 | |||
| 1aa2f81b43 | |||
| d54af615d5 | |||
| a1cc9f33a3 | |||
| a521ef06e5 | |||
| 64eaf5fe05 | |||
| d1d61f3351 | |||
| 32ce3cf7c9 | |||
| d58f9c7f7a | |||
| c29034037d | |||
| 1b7cfd5a36 | |||
| da4b69d0b4 | |||
| c9479b2920 | |||
| 6f2909405e | |||
| b169d5f7b6 | |||
| f8977c233f | |||
| f274581f44 | |||
| 0b1447f890 | |||
| 24d0ef8970 | |||
| 7fcfd954ff | |||
| e740d07f07 | |||
| a652e71dd0 | |||
| 34d6c447c4 | |||
| 972eddf7c9 | |||
| fd7bb88d72 | |||
| 3c49dbdd03 | |||
| 1661a9c28f | |||
| 8e882ffdc0 | |||
| 26b4fa45be | |||
| 515b413ebf | |||
| 269d901734 | |||
| 7951d78738 | |||
| 6dbe5b5c93 | |||
| 643622ba46 | |||
| a09c7ca9f2 | |||
| 0e98964e94 | |||
| c68b5c63eb | |||
| fced756923 | |||
| 321331b8ae | |||
| 6e4cea1cc5 | |||
| 435fa95444 | |||
| 4c2b38ce9e | |||
| d781930f90 | |||
| ce75efeecb | |||
| aa42561e40 | |||
| de65fc8e1e | |||
| 0c492b7824 | |||
| 0f0926b43f | |||
| 7f2c1a87e9 | |||
| b78f844a67 | |||
| 5e13c07d00 | |||
| 774c5fde30 | |||
| 9a21e331ff | |||
| 3e9ce609bd | |||
| 794ae1f551 | |||
| d73a9457a5 | |||
| a3896c7f02 | |||
| 51e98e4ffd | |||
| e56f44d9ec | |||
| e0cbad4e30 | |||
| b48d5cca16 |
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import os
 import sys

@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import argparse
 import os

@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from pathlib import Path

 import pytest

@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """
 LM eval harness on model to compare vs HF baseline computed offline.
 Configs are found in configs/$MODEL.yaml
@@ -113,7 +113,7 @@ WARNING: The benchmarking script will save json results by itself, so please do

 ### Visualizing the results

-The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](tests/descriptions.md) with real benchmarking results.
+The `convert-results-json-to-markdown.py` helps you put the benchmarking results inside a markdown table, by formatting [descriptions.md](performance-benchmarks-descriptions.md) with real benchmarking results.
 You can find the result presented as a table inside the `buildkite/performance-benchmark` job page.
 If you do not see the table, please wait till the benchmark finish running.
 The json version of the table (together with the json version of the benchmark) will be also attached to the markdown file.
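For orientation, a minimal sketch of the JSON-to-markdown step (this is not the repository's `convert-results-json-to-markdown.py`; the result schema, field names, and file names below are assumptions, and `jq` stands in for the real Python implementation):

```bash
# Assumed input: results.json is a list of objects such as
#   {"test_name": "...", "request_throughput": 8.3, "mean_ttft_ms": 42.1}
{
  echo "| Test | Request throughput (req/s) | Mean TTFT (ms) |"
  echo "|---|---|---|"
  jq -r '.[] | "| \(.test_name) | \(.request_throughput) | \(.mean_ttft_ms) |"' results.json
} > results_table.md
```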
@@ -16,7 +16,7 @@ Please download the visualization scripts in the post
 - Download `nightly-benchmarks.zip`.
 - In the same folder, run the following code:

-```console
+```bash
 export HF_TOKEN=<your HF token>
 apt update
 apt install -y git
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import json
 import os

@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import argparse

@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import argparse
 import json

@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 from lmdeploy.serve.openai.api_client import APIClient

@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import datetime
 import json
@@ -1,5 +1,6 @@
 steps:
 - label: "Build wheel - CUDA 12.8"
+  id: build-wheel-cuda-12-8
   agents:
     queue: cpu_queue_postmerge
   commands:

@@ -11,6 +12,7 @@ steps:
     DOCKER_BUILDKIT: "1"

 - label: "Build wheel - CUDA 12.6"
+  id: build-wheel-cuda-12-6
   agents:
     queue: cpu_queue_postmerge
   commands:

@@ -28,6 +30,7 @@ steps:

 - label: "Build wheel - CUDA 11.8"
   # depends_on: block-build-cu118-wheel
+  id: build-wheel-cuda-11-8
   agents:
     queue: cpu_queue_postmerge
   commands:

@@ -44,6 +47,7 @@ steps:

 - label: "Build release image"
   depends_on: block-release-image-build
+  id: build-release-image
   agents:
     queue: cpu_queue_postmerge
   commands:

@@ -51,6 +55,18 @@ steps:
     - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain -f docker/Dockerfile ."
     - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"

+- label: "Annotate release workflow"
+  depends_on:
+    - build-release-image
+    - build-wheel-cuda-12-8
+    - build-wheel-cuda-12-6
+    - build-wheel-cuda-11-8
+  id: annotate-release-workflow
+  agents:
+    queue: cpu_queue_postmerge
+  commands:
+    - "bash .buildkite/scripts/annotate-release.sh"
+
 - label: "Build and publish TPU release image"
   depends_on: ~
   if: build.env("NIGHTLY") == "1"

@@ -70,9 +86,10 @@ steps:
     DOCKER_BUILDKIT: "1"

 - input: "Provide Release version here"
+  id: input-release-version
   fields:
     - text: "What is the release version?"
-      key: "release-version"
+      key: release-version

 - block: "Build CPU release image"
   key: block-cpu-release-image-build

@@ -85,6 +102,7 @@ steps:
   commands:
     - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
     - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest --progress plain --target vllm-openai -f docker/Dockerfile.cpu ."
     - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:latest"
     - "docker push public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:$(buildkite-agent meta-data get release-version)"
   env:
     DOCKER_BUILDKIT: "1"

@@ -100,6 +118,7 @@ steps:
   commands:
     - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
     - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg GIT_REPO_CHECK=1 --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version) --tag public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:latest --progress plain -f docker/Dockerfile.neuron ."
     - "docker push public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:latest"
     - "docker push public.ecr.aws/q9t5s3a7/vllm-neuron-release-repo:$(buildkite-agent meta-data get release-version)"
   env:
     DOCKER_BUILDKIT: "1"
.buildkite/scripts/annotate-release.sh (new executable file, 31 lines)
@@ -0,0 +1,31 @@
#!/bin/bash

set -ex

# Get release version and strip leading 'v' if present
RELEASE_VERSION=$(buildkite-agent meta-data get release-version | sed 's/^v//')

if [ -z "$RELEASE_VERSION" ]; then
  echo "Error: RELEASE_VERSION is empty. 'release-version' metadata might not be set or is invalid."
  exit 1
fi

buildkite-agent annotate --style 'info' --context 'release-workflow' << EOF
To download the wheel:
\`\`\`
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}/vllm-${RELEASE_VERSION}-cp38-abi3-manylinux1_x86_64.whl .
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu126/vllm-${RELEASE_VERSION}+cu126-cp38-abi3-manylinux1_x86_64.whl .
aws s3 cp s3://vllm-wheels/${RELEASE_VERSION}+cu118/vllm-${RELEASE_VERSION}+cu118-cp38-abi3-manylinux1_x86_64.whl .
\`\`\`

To download and upload the image:

\`\`\`
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT} vllm/vllm-openai
docker tag vllm/vllm-openai vllm/vllm-openai:latest
docker tag vllm/vllm-openai vllm/vllm-openai:v${RELEASE_VERSION}
docker push vllm/vllm-openai:latest
docker push vllm/vllm-openai:v${RELEASE_VERSION}
\`\`\`
EOF
.buildkite/scripts/ci-clean-log.sh (new file, 17 lines)
@@ -0,0 +1,17 @@
#!/bin/bash
# Usage: ./ci_clean_log.sh ci.log
# This script strips timestamps and color codes from CI log files.

# Check if argument is given
if [ $# -lt 1 ]; then
  echo "Usage: $0 ci.log"
  exit 1
fi

INPUT_FILE="$1"

# Strip timestamps
sed -i 's/^\[[0-9]\{4\}-[0-9]\{2\}-[0-9]\{2\}T[0-9]\{2\}:[0-9]\{2\}:[0-9]\{2\}Z\] //' "$INPUT_FILE"

# Strip colorization
sed -i -r 's/\x1B\[[0-9;]*[mK]//g' "$INPUT_FILE"
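For illustration only (not part of the repository), this is roughly what the two `sed` filters do to a single log line, assuming the `[YYYY-MM-DDTHH:MM:SSZ]` timestamp prefix the script targets:

```bash
# Hypothetical sample line: timestamp prefix plus an ANSI color code
printf '[2025-06-01T12:00:00Z] \033[32mPASSED\033[0m tests/foo.py\n' > ci.log
bash .buildkite/scripts/ci-clean-log.sh ci.log
cat ci.log   # -> PASSED tests/foo.py (timestamp and color codes removed)
```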
@@ -94,6 +94,10 @@ if [[ $commands == *"pytest -v -s compile/test_basic_correctness.py"* ]]; then
   commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s compile/test_basic_correctness.py"}
 fi

+if [[ $commands == *"pytest -v -s lora"* ]]; then
+  commands=${commands//"pytest -v -s lora"/"VLLM_ROCM_CUSTOM_PAGED_ATTN=0 pytest -v -s lora"}
+fi

 #ignore certain kernels tests
 if [[ $commands == *" kernels/core"* ]]; then
   commands="${commands} \
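Side note (illustrative, not part of the CI script): the `${var//pattern/replacement}` expansion used above performs a global substring replacement on the accumulated command string, for example:

```bash
commands="pytest -v -s lora && pytest -v -s compile/test_basic_correctness.py"
commands=${commands//"pytest -v -s lora"/"VLLM_ROCM_CUSTOM_PAGED_ATTN=0 pytest -v -s lora"}
echo "$commands"
# VLLM_ROCM_CUSTOM_PAGED_ATTN=0 pytest -v -s lora && pytest -v -s compile/test_basic_correctness.py
```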
@@ -7,6 +7,7 @@ set -ex
 # Setup cleanup
 remove_docker_container() {
   if [[ -n "$container_id" ]]; then
     podman stop --all -t0
     podman rm -f "$container_id" || true
   fi
   podman system prune -f

@@ -37,7 +38,7 @@ function cpu_tests() {
   pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-facebook/opt-125m]
   pytest -v -s tests/models/language/generation/test_common.py::test_models[False-5-32-google/gemma-1.1-2b-it]
   pytest -v -s tests/models/language/pooling/test_classification.py::test_models[float-jason9693/Qwen2.5-1.5B-apeach]
-  pytest -v -s tests/models/language/pooling/test_embedding.py::test_models[half-BAAI/bge-base-en-v1.5]"
+  pytest -v -s tests/models/language/pooling/test_embedding.py -m cpu_model"
 }

 # All of CPU tests are expected to be finished less than 40 mins.
@ -6,75 +6,82 @@ set -ex
|
||||
|
||||
# allow to bind to different cores
|
||||
CORE_RANGE=${CORE_RANGE:-48-95}
|
||||
OMP_CORE_RANGE=${OMP_CORE_RANGE:-48-95}
|
||||
NUMA_NODE=${NUMA_NODE:-1}
|
||||
|
||||
export CMAKE_BUILD_PARALLEL_LEVEL=32
|
||||
|
||||
# Setup cleanup
|
||||
remove_docker_container() {
|
||||
set -e;
|
||||
docker rm -f cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" || true;
|
||||
docker image rm cpu-test-"$BUILDKITE_BUILD_NUMBER" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 || true;
|
||||
docker rm -f cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"-avx2 || true;
|
||||
}
|
||||
trap remove_docker_container EXIT
|
||||
remove_docker_container
|
||||
|
||||
# Try building the docker image
|
||||
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$BUILDKITE_BUILD_NUMBER" --target vllm-test -f docker/Dockerfile.cpu .
|
||||
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
|
||||
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --tag cpu-test-"$NUMA_NODE" --target vllm-test -f docker/Dockerfile.cpu .
|
||||
numactl -C "$CORE_RANGE" -N "$NUMA_NODE" docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" --tag cpu-test-"$NUMA_NODE"-avx2 --target vllm-test -f docker/Dockerfile.cpu .
|
||||
|
||||
# Run the image, setting --shm-size=4g for tensor parallel.
|
||||
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
|
||||
--cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"
|
||||
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
|
||||
--cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2
|
||||
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --env VLLM_CPU_CI_ENV=1 --shm-size=4g --name cpu-test-"$NUMA_NODE" cpu-test-"$NUMA_NODE"
|
||||
docker run -itd --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --env VLLM_CPU_OMP_THREADS_BIND="$OMP_CORE_RANGE" --env VLLM_CPU_CI_ENV=1 --shm-size=4g --name cpu-test-"$NUMA_NODE"-avx2 cpu-test-"$NUMA_NODE"-avx2
|
||||
|
||||
function cpu_tests() {
|
||||
set -e
|
||||
export NUMA_NODE=$2
|
||||
export BUILDKITE_BUILD_NUMBER=$3
|
||||
|
||||
# list packages
|
||||
docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c "
|
||||
set -e
|
||||
pip list"
|
||||
|
||||
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
||||
set -e
|
||||
pip list"
|
||||
|
||||
# offline inference
|
||||
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" bash -c "
|
||||
docker exec cpu-test-"$NUMA_NODE"-avx2 bash -c "
|
||||
set -e
|
||||
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
|
||||
|
||||
# Run basic model test
|
||||
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
|
||||
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
||||
set -e
|
||||
pytest -v -s tests/kernels/test_cache.py -m cpu_model
|
||||
pytest -v -s tests/kernels/test_mla_decode_cpu.py -m cpu_model
|
||||
pytest -v -s tests/models/decoder_only/language -m cpu_model
|
||||
pytest -v -s tests/models/embedding/language -m cpu_model
|
||||
pytest -v -s tests/models/encoder_decoder/language -m cpu_model
|
||||
pytest -v -s tests/models/decoder_only/audio_language -m cpu_model
|
||||
pytest -v -s tests/models/decoder_only/vision_language -m cpu_model"
|
||||
pytest -v -s tests/kernels/attention/test_cache.py -m cpu_model
|
||||
pytest -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model
|
||||
pytest -v -s tests/models/language/generation -m cpu_model
|
||||
pytest -v -s tests/models/language/pooling -m cpu_model
|
||||
pytest -v -s tests/models/multimodal/generation \
|
||||
--ignore=tests/models/multimodal/generation/test_mllama.py \
|
||||
--ignore=tests/models/multimodal/generation/test_pixtral.py \
|
||||
-m cpu_model"
|
||||
|
||||
# Run compressed-tensor test
|
||||
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
|
||||
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
||||
set -e
|
||||
pytest -s -v \
|
||||
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
|
||||
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token"
|
||||
|
||||
# Run AWQ test
|
||||
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
|
||||
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
||||
set -e
|
||||
pytest -s -v \
|
||||
VLLM_USE_V1=0 pytest -s -v \
|
||||
tests/quantization/test_ipex_quant.py"
|
||||
|
||||
# Run chunked-prefill and prefix-cache test
|
||||
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
|
||||
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
||||
set -e
|
||||
pytest -s -v -k cpu_model \
|
||||
tests/basic_correctness/test_chunked_prefill.py"
|
||||
|
||||
# online serving
|
||||
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
|
||||
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
||||
set -e
|
||||
export VLLM_CPU_KVCACHE_SPACE=10
|
||||
export VLLM_CPU_OMP_THREADS_BIND=$1
|
||||
python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half &
|
||||
timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
|
||||
python3 benchmarks/benchmark_serving.py \
|
||||
VLLM_CPU_CI_ENV=0 python3 benchmarks/benchmark_serving.py \
|
||||
--backend vllm \
|
||||
--dataset-name random \
|
||||
--model facebook/opt-125m \
|
||||
@ -83,7 +90,7 @@ function cpu_tests() {
|
||||
--tokenizer facebook/opt-125m"
|
||||
|
||||
# Run multi-lora tests
|
||||
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
|
||||
docker exec cpu-test-"$NUMA_NODE" bash -c "
|
||||
set -e
|
||||
pytest -s -v \
|
||||
tests/lora/test_qwen2vl.py"
|
||||
@ -91,4 +98,4 @@ function cpu_tests() {
|
||||
|
||||
# All of CPU tests are expected to be finished less than 40 mins.
|
||||
export -f cpu_tests
|
||||
timeout 40m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE $BUILDKITE_BUILD_NUMBER"
|
||||
timeout 1h bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
|
||||
|
||||
@@ -54,10 +54,11 @@ docker run --rm -it --device=/dev/neuron0 --network bridge \
   --name "${container_name}" \
   ${image_name} \
   /bin/bash -c "
     set -e; # Exit on first error
     python3 /workspace/vllm/examples/offline_inference/neuron.py;
     python3 -m pytest /workspace/vllm/tests/neuron/1_core/ -v --capture=tee-sys;
     for f in /workspace/vllm/tests/neuron/2_core/*.py; do
-      echo 'Running test file: '$f;
+      echo \"Running test file: \$f\";
       python3 -m pytest \$f -v --capture=tee-sys;
     done
   "
@ -2,102 +2,186 @@
|
||||
|
||||
set -xu
|
||||
|
||||
|
||||
remove_docker_container() {
|
||||
docker rm -f tpu-test || true;
|
||||
docker rm -f vllm-tpu || true;
|
||||
}
|
||||
|
||||
trap remove_docker_container EXIT
|
||||
|
||||
# Remove the container that might not be cleaned up in the previous run.
|
||||
remove_docker_container
|
||||
|
||||
# Build the docker image.
|
||||
docker build -f docker/Dockerfile.tpu -t vllm-tpu .
|
||||
|
||||
# Set up cleanup.
|
||||
remove_docker_container() { docker rm -f tpu-test || true; }
|
||||
trap remove_docker_container EXIT
|
||||
# Remove the container that might not be cleaned up in the previous run.
|
||||
remove_docker_container
|
||||
cleanup_docker() {
|
||||
# Get Docker's root directory
|
||||
docker_root=$(docker info -f '{{.DockerRootDir}}')
|
||||
if [ -z "$docker_root" ]; then
|
||||
echo "Failed to determine Docker root directory."
|
||||
exit 1
|
||||
fi
|
||||
echo "Docker root directory: $docker_root"
|
||||
# Check disk usage of the filesystem where Docker's root directory is located
|
||||
disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
|
||||
# Define the threshold
|
||||
threshold=70
|
||||
if [ "$disk_usage" -gt "$threshold" ]; then
|
||||
echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
|
||||
# Remove dangling images (those that are not tagged and not used by any container)
|
||||
docker image prune -f
|
||||
# Remove unused volumes / force the system prune for old images as well.
|
||||
docker volume prune -f && docker system prune --force --filter "until=72h" --all
|
||||
echo "Docker images and volumes cleanup completed."
|
||||
else
|
||||
echo "Disk usage is below $threshold%. No cleanup needed."
|
||||
fi
|
||||
}
|
||||
cleanup_docker
|
||||
|
||||
# For HF_TOKEN.
|
||||
source /etc/environment
|
||||
# Run a simple end-to-end example.
|
||||
|
||||
docker run --privileged --net host --shm-size=16G -it \
|
||||
-e "HF_TOKEN=$HF_TOKEN" --name tpu-test \
|
||||
vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \
|
||||
&& python3 -m pip install pytest pytest-asyncio tpu-info \
|
||||
&& python3 -m pip install lm_eval[api]==0.4.4 \
|
||||
&& export VLLM_XLA_CACHE_PATH= \
|
||||
&& export VLLM_USE_V1=1 \
|
||||
&& export VLLM_XLA_CHECK_RECOMPILATION=1 \
|
||||
&& echo HARDWARE \
|
||||
&& tpu-info \
|
||||
&& { \
|
||||
echo TEST_0: Running test_perf.py; \
|
||||
python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_perf.py; \
|
||||
echo TEST_0_EXIT_CODE: \$?; \
|
||||
} & \
|
||||
{ \
|
||||
echo TEST_1: Running test_compilation.py; \
|
||||
python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_compilation.py; \
|
||||
echo TEST_1_EXIT_CODE: \$?; \
|
||||
} & \
|
||||
{ \
|
||||
echo TEST_2: Running test_basic.py; \
|
||||
python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_basic.py; \
|
||||
echo TEST_2_EXIT_CODE: \$?; \
|
||||
} & \
|
||||
{ \
|
||||
echo TEST_3: Running test_accuracy.py::test_lm_eval_accuracy_v1_engine; \
|
||||
python3 -m pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine; \
|
||||
echo TEST_3_EXIT_CODE: \$?; \
|
||||
} & \
|
||||
{ \
|
||||
echo TEST_4: Running test_quantization_accuracy.py; \
|
||||
python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py; \
|
||||
echo TEST_4_EXIT_CODE: \$?; \
|
||||
} & \
|
||||
{ \
|
||||
echo TEST_5: Running examples/offline_inference/tpu.py; \
|
||||
python3 /workspace/vllm/examples/offline_inference/tpu.py; \
|
||||
echo TEST_5_EXIT_CODE: \$?; \
|
||||
} & \
|
||||
{ \
|
||||
echo TEST_6: Running test_tpu_model_runner.py; \
|
||||
python3 -m pytest -s -v /workspace/vllm/tests/tpu/worker/test_tpu_model_runner.py; \
|
||||
echo TEST_6_EXIT_CODE: \$?; \
|
||||
} & \
|
||||
{ \
|
||||
echo TEST_7: Running test_sampler.py; \
|
||||
python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py; \
|
||||
echo TEST_7_EXIT_CODE: \$?; \
|
||||
} & \
|
||||
{ \
|
||||
echo TEST_8: Running test_topk_topp_sampler.py; \
|
||||
python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_topk_topp_sampler.py; \
|
||||
echo TEST_8_EXIT_CODE: \$?; \
|
||||
} & \
|
||||
{ \
|
||||
echo TEST_9: Running test_multimodal.py; \
|
||||
python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py; \
|
||||
echo TEST_9_EXIT_CODE: \$?; \
|
||||
} & \
|
||||
{ \
|
||||
echo TEST_10: Running test_pallas.py; \
|
||||
python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py; \
|
||||
echo TEST_10_EXIT_CODE: \$?; \
|
||||
} & \
|
||||
{ \
|
||||
echo TEST_11: Running test_struct_output_generate.py; \
|
||||
python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py; \
|
||||
echo TEST_11_EXIT_CODE: \$?; \
|
||||
} & \
|
||||
{ \
|
||||
echo TEST_12: Running test_moe_pallas.py; \
|
||||
python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py; \
|
||||
echo TEST_12_EXIT_CODE: \$?; \
|
||||
} & \
|
||||
# Disable the TPU LoRA tests until the feature is activated
|
||||
# & { \
|
||||
# echo TEST_13: Running test_moe_pallas.py; \
|
||||
# python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/; \
|
||||
# echo TEST_13_EXIT_CODE: \$?; \
|
||||
# } & \
|
||||
wait \
|
||||
&& echo 'All tests have attempted to run. Check logs for individual test statuses and exit codes.' \
|
||||
"
|
||||
vllm-tpu /bin/bash -c '
|
||||
set -e # Exit immediately if a command exits with a non-zero status.
|
||||
set -u # Treat unset variables as an error.
|
||||
|
||||
echo "--- Starting script inside Docker container ---"
|
||||
|
||||
# Create results directory
|
||||
RESULTS_DIR=$(mktemp -d)
|
||||
# If mktemp fails, set -e will cause the script to exit.
|
||||
echo "Results will be stored in: $RESULTS_DIR"
|
||||
|
||||
# Install dependencies
|
||||
echo "--- Installing Python dependencies ---"
|
||||
python3 -m pip install --progress-bar off git+https://github.com/thuml/depyf.git \
|
||||
&& python3 -m pip install --progress-bar off pytest pytest-asyncio tpu-info \
|
||||
&& python3 -m pip install --progress-bar off lm_eval[api]==0.4.4
|
||||
echo "--- Python dependencies installed ---"
|
||||
export VLLM_USE_V1=1
|
||||
export VLLM_XLA_CHECK_RECOMPILATION=1
|
||||
export VLLM_XLA_CACHE_PATH=
|
||||
echo "Using VLLM V1"
|
||||
|
||||
echo "--- Hardware Information ---"
|
||||
tpu-info
|
||||
echo "--- Starting Tests ---"
|
||||
set +e
|
||||
overall_script_exit_code=0
|
||||
|
||||
# --- Test Definitions ---
|
||||
# If a test fails, this function will print logs and will not cause the main script to exit.
|
||||
run_test() {
|
||||
local test_num=$1
|
||||
local test_name=$2
|
||||
local test_command=$3
|
||||
local log_file="$RESULTS_DIR/test_${test_num}.log"
|
||||
local actual_exit_code
|
||||
|
||||
echo "--- TEST_$test_num: Running $test_name ---"
|
||||
|
||||
# Execute the test command.
|
||||
eval "$test_command" > >(tee -a "$log_file") 2> >(tee -a "$log_file" >&2)
|
||||
actual_exit_code=$?
|
||||
|
||||
echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" # This goes to main log
|
||||
echo "TEST_${test_num}_COMMAND_EXIT_CODE: $actual_exit_code" >> "$log_file" # Also to per-test log
|
||||
|
||||
if [ "$actual_exit_code" -ne 0 ]; then
|
||||
echo "TEST_$test_num ($test_name) FAILED with exit code $actual_exit_code." >&2
|
||||
echo "--- Log for failed TEST_$test_num ($test_name) ---" >&2
|
||||
if [ -f "$log_file" ]; then
|
||||
cat "$log_file" >&2
|
||||
else
|
||||
echo "Log file $log_file not found for TEST_$test_num ($test_name)." >&2
|
||||
fi
|
||||
echo "--- End of log for TEST_$test_num ($test_name) ---" >&2
|
||||
return "$actual_exit_code" # Return the failure code
|
||||
else
|
||||
echo "TEST_$test_num ($test_name) PASSED."
|
||||
return 0 # Return success
|
||||
fi
|
||||
}
|
||||
|
||||
# Helper function to call run_test and update the overall script exit code
|
||||
run_and_track_test() {
|
||||
local test_num_arg="$1"
|
||||
local test_name_arg="$2"
|
||||
local test_command_arg="$3"
|
||||
|
||||
# Run the test
|
||||
run_test "$test_num_arg" "$test_name_arg" "$test_command_arg"
|
||||
local test_specific_exit_code=$?
|
||||
|
||||
# If the test failed, set the overall script exit code to 1
|
||||
if [ "$test_specific_exit_code" -ne 0 ]; then
|
||||
# No need for extra echo here, run_test already logged the failure.
|
||||
overall_script_exit_code=1
|
||||
fi
|
||||
}
|
||||
|
||||
# --- Actual Test Execution ---
|
||||
run_and_track_test 0 "test_perf.py" \
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_perf.py"
|
||||
run_and_track_test 1 "test_compilation.py" \
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_compilation.py"
|
||||
run_and_track_test 2 "test_basic.py" \
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_basic.py"
|
||||
run_and_track_test 3 "test_accuracy.py::test_lm_eval_accuracy_v1_engine" \
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine"
|
||||
run_and_track_test 4 "test_quantization_accuracy.py" \
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py"
|
||||
run_and_track_test 5 "examples/offline_inference/tpu.py" \
|
||||
"python3 /workspace/vllm/examples/offline_inference/tpu.py"
|
||||
run_and_track_test 6 "test_tpu_model_runner.py" \
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/worker/test_tpu_model_runner.py"
|
||||
run_and_track_test 7 "test_sampler.py" \
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_sampler.py"
|
||||
run_and_track_test 8 "test_topk_topp_sampler.py" \
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_topk_topp_sampler.py"
|
||||
run_and_track_test 9 "test_multimodal.py" \
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_multimodal.py"
|
||||
run_and_track_test 10 "test_pallas.py" \
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_pallas.py"
|
||||
run_and_track_test 11 "test_struct_output_generate.py" \
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\""
|
||||
run_and_track_test 12 "test_moe_pallas.py" \
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py"
|
||||
run_and_track_test 13 "test_lora.py" \
|
||||
"VLLM_XLA_CHECK_RECOMPILATION=0 python3 -m pytest -s -v /workspace/vllm/tests/tpu/lora/test_lora.py"
|
||||
run_and_track_test 14 "test_tpu_qkv_linear.py" \
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_tpu_qkv_linear.py"
|
||||
run_and_track_test 15 "test_spmd_model_weight_loading.py" \
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_spmd_model_weight_loading.py"
|
||||
run_and_track_test 16 "test_kv_cache_update_kernel.py" \
|
||||
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_kv_cache_update_kernel.py"
|
||||
|
||||
# After all tests have been attempted, exit with the overall status.
|
||||
if [ "$overall_script_exit_code" -ne 0 ]; then
|
||||
echo "--- One or more tests FAILED. Overall script exiting with failure code 1. ---"
|
||||
else
|
||||
echo "--- All tests have completed and PASSED. Overall script exiting with success code 0. ---"
|
||||
fi
|
||||
exit "$overall_script_exit_code"
|
||||
' # IMPORTANT: This is the closing single quote for the bash -c "..." command. Ensure it is present and correct.
|
||||
|
||||
# Capture the exit code of the docker run command
|
||||
DOCKER_RUN_EXIT_CODE=$?
|
||||
|
||||
# The trap will run for cleanup.
|
||||
# Exit the main script with the Docker run command's exit code.
|
||||
if [ "$DOCKER_RUN_EXIT_CODE" -ne 0 ]; then
|
||||
echo "Docker run command failed with exit code $DOCKER_RUN_EXIT_CODE."
|
||||
exit "$DOCKER_RUN_EXIT_CODE"
|
||||
else
|
||||
echo "Docker run command completed successfully."
|
||||
exit 0
|
||||
fi
|
||||
# TODO: This test fails because it uses RANDOM_SEED sampling
|
||||
# && VLLM_USE_V1=1 pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
|
||||
# pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
|
||||
|
||||
@@ -28,4 +28,5 @@ docker run \
   sh -c '
     VLLM_USE_V1=0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m
     VLLM_USE_V1=0 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m -tp 2
+    VLLM_USE_V1=1 python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m --block-size 64 --enforce-eager
   '
.buildkite/scripts/rerun-test.sh (new file, 18 lines)
@@ -0,0 +1,18 @@
#!/bin/bash

# Usage: ./rerun_test.sh path/to/test.py::test_name

# Check if argument is given
if [ $# -lt 1 ]; then
  echo "Usage: $0 path/to/test.py::test_name"
  echo "Example: $0 tests/v1/engine/test_engine_core_client.py::test_kv_cache_events[True-tcp]"
  exit 1
fi

TEST=$1
COUNT=1

while pytest -sv "$TEST"; do
  COUNT=$((COUNT + 1))
  echo "RUN NUMBER ${COUNT}"
done
.buildkite/scripts/tpu/cleanup_docker.sh (new executable file, 24 lines)
@@ -0,0 +1,24 @@
#!/bin/bash

set -euo pipefail

docker_root=$(docker info -f '{{.DockerRootDir}}')
if [ -z "$docker_root" ]; then
  echo "Failed to determine Docker root directory."
  exit 1
fi
echo "Docker root directory: $docker_root"
# Check disk usage of the filesystem where Docker's root directory is located
disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
# Define the threshold
threshold=70
if [ "$disk_usage" -gt "$threshold" ]; then
  echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
  # Remove dangling images (those that are not tagged and not used by any container)
  docker image prune -f
  # Remove unused volumes / force the system prune for old images as well.
  docker volume prune -f && docker system prune --force --filter "until=72h" --all
  echo "Docker images and volumes cleanup completed."
else
  echo "Disk usage is below $threshold%. No cleanup needed."
fi
.buildkite/scripts/tpu/config_v6e_1.env (new file, 14 lines)
@@ -0,0 +1,14 @@
# Environment config
TEST_NAME=llama8b
CONTAINER_NAME=vllm-tpu

# vllm config
MODEL=meta-llama/Llama-3.1-8B-Instruct
MAX_NUM_SEQS=256
MAX_NUM_BATCHED_TOKENS=1024
TENSOR_PARALLEL_SIZE=1
MAX_MODEL_LEN=2048
DOWNLOAD_DIR=/mnt/disks/persist
EXPECTED_THROUGHPUT=8.0
INPUT_LEN=1800
OUTPUT_LEN=128
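A hypothetical local invocation (assuming a TPU VM with `/mnt/disks/persist` mounted and `HF_TOKEN` available through `/etc/environment`), passing this config to the benchmark driver added below:

```bash
bash .buildkite/scripts/tpu/docker_run_bm.sh .buildkite/scripts/tpu/config_v6e_1.env
```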
.buildkite/scripts/tpu/docker_run_bm.sh (new executable file, 102 lines)
@@ -0,0 +1,102 @@
#!/bin/bash

if [ ! -f "$1" ]; then
  echo "Error: The env file '$1' does not exist."
  exit 1 # Exit the script with a non-zero status to indicate an error
fi

ENV_FILE=$1

# For testing on local vm, use `set -a` to export all variables
source /etc/environment
source $ENV_FILE

remove_docker_container() {
  docker rm -f tpu-test || true;
  docker rm -f vllm-tpu || true;
  docker rm -f $CONTAINER_NAME || true;
}

trap remove_docker_container EXIT

# Remove the container that might not be cleaned up in the previous run.
remove_docker_container

# Build docker image.
# TODO: build the image outside the script and share the image with other
# tpu test if building time is too long.
DOCKER_BUILDKIT=1 docker build \
  --build-arg max_jobs=16 \
  --build-arg USE_SCCACHE=1 \
  --build-arg GIT_REPO_CHECK=0 \
  --tag vllm/vllm-tpu-bm \
  --progress plain -f docker/Dockerfile.tpu .

LOG_ROOT=$(mktemp -d)
# If mktemp fails, set -e will cause the script to exit.
echo "Results will be stored in: $LOG_ROOT"

if [ -z "$HF_TOKEN" ]; then
  echo "Error: HF_TOKEN is not set or is empty."
  exit 1
fi

# Make sure mounted disk or dir exists
if [ ! -d "$DOWNLOAD_DIR" ]; then
  echo "Error: Folder $DOWNLOAD_DIR does not exist. This is useually a mounted drive. If no mounted drive, just create a folder."
  exit 1
fi

echo "Run model $MODEL"
echo

echo "starting docker...$CONTAINER_NAME"
echo
docker run \
  -v $DOWNLOAD_DIR:$DOWNLOAD_DIR \
  --env-file $ENV_FILE \
  -e HF_TOKEN="$HF_TOKEN" \
  -e TARGET_COMMIT=$BUILDKITE_COMMIT \
  -e MODEL=$MODEL \
  -e WORKSPACE=/workspace \
  --name $CONTAINER_NAME \
  -d \
  --privileged \
  --network host \
  -v /dev/shm:/dev/shm \
  vllm/vllm-tpu-bm tail -f /dev/null

echo "run script..."
echo
docker exec "$CONTAINER_NAME" /bin/bash -c ".buildkite/scripts/tpu/run_bm.sh"

echo "copy result back..."
VLLM_LOG="$LOG_ROOT/$TEST_NAME"_vllm_log.txt
BM_LOG="$LOG_ROOT/$TEST_NAME"_bm_log.txt
docker cp "$CONTAINER_NAME:/workspace/vllm_log.txt" "$VLLM_LOG"
docker cp "$CONTAINER_NAME:/workspace/bm_log.txt" "$BM_LOG"

throughput=$(grep "Request throughput (req/s):" "$BM_LOG" | sed 's/[^0-9.]//g')
echo "throughput for $TEST_NAME at $BUILDKITE_COMMIT: $throughput"

if [ "$BUILDKITE" = "true" ]; then
  echo "Running inside Buildkite"
  buildkite-agent artifact upload "$VLLM_LOG"
  buildkite-agent artifact upload "$BM_LOG"
else
  echo "Not running inside Buildkite"
fi

#
# compare the throughput with EXPECTED_THROUGHPUT
# and assert meeting the expectation
#
if [[ -z "$throughput" || ! "$throughput" =~ ^[0-9]+([.][0-9]+)?$ ]]; then
  echo "Failed to get the throughput"
  exit 1
fi

if (( $(echo "$throughput < $EXPECTED_THROUGHPUT" | bc -l) )); then
  echo "Error: throughput($throughput) is less than expected($EXPECTED_THROUGHPUT)"
  exit 1
fi
.buildkite/scripts/tpu/run_bm.sh (new executable file, 94 lines)
@@ -0,0 +1,94 @@
#!/bin/bash

set -euo pipefail

VLLM_LOG="$WORKSPACE/vllm_log.txt"
BM_LOG="$WORKSPACE/bm_log.txt"

if [ -n "$TARGET_COMMIT" ]; then
  head_hash=$(git rev-parse HEAD)
  if [ "$TARGET_COMMIT" != "$head_hash" ]; then
    echo "Error: target commit $TARGET_COMMIT does not match HEAD: $head_hash"
    exit 1
  fi
fi

echo "model: $MODEL"
echo

#
# create a log folder
#
mkdir "$WORKSPACE/log"

# TODO: Move to image building.
pip install pandas
pip install datasets

#
# create sonnet_4x
#
echo "Create sonnet_4x.txt"
echo "" > benchmarks/sonnet_4x.txt
for _ in {1..4}
do
  cat benchmarks/sonnet.txt >> benchmarks/sonnet_4x.txt
done

#
# start vllm service in backend
#
echo "lanching vllm..."
echo "logging to $VLLM_LOG"
echo

VLLM_USE_V1=1 vllm serve $MODEL \
  --seed 42 \
  --disable-log-requests \
  --max-num-seqs $MAX_NUM_SEQS \
  --max-num-batched-tokens $MAX_NUM_BATCHED_TOKENS \
  --tensor-parallel-size $TENSOR_PARALLEL_SIZE \
  --no-enable-prefix-caching \
  --download_dir $DOWNLOAD_DIR \
  --max-model-len $MAX_MODEL_LEN > "$VLLM_LOG" 2>&1 &

echo "wait for 20 minutes.."
echo
# sleep 1200
# wait for 10 minutes...
for i in {1..120}; do
  # TODO: detect other type of errors.
  if grep -Fq "raise RuntimeError" "$VLLM_LOG"; then
    echo "Detected RuntimeError, exiting."
    exit 1
  elif grep -Fq "Application startup complete" "$VLLM_LOG"; then
    echo "Application started"
    break
  else
    echo "wait for 10 seconds..."
    sleep 10
  fi
done

#
# run test
#
echo "run benchmark test..."
echo "logging to $BM_LOG"
echo
python benchmarks/benchmark_serving.py \
  --backend vllm \
  --model $MODEL \
  --dataset-name sonnet \
  --dataset-path benchmarks/sonnet_4x.txt \
  --sonnet-input-len $INPUT_LEN \
  --sonnet-output-len $OUTPUT_LEN \
  --ignore-eos > "$BM_LOG"

echo "completed..."
echo

throughput=$(grep "Request throughput (req/s):" "$BM_LOG" | sed 's/[^0-9.]//g')
echo "throughput: $throughput"
echo
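For illustration only (not part of the scripts), this is the kind of summary line the `grep`/`sed` pair above extracts the throughput from; the exact spacing of the `benchmark_serving.py` output may differ:

```bash
line='Request throughput (req/s):              8.34'
echo "$line" | sed 's/[^0-9.]//g'   # prints 8.34
```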
@ -41,6 +41,16 @@ steps:
|
||||
# TODO: add `--strict` once warnings in docstrings are fixed
|
||||
- mkdocs build
|
||||
|
||||
- label: Pytorch Nightly Dependency Override Check # 2min
|
||||
# if this test fails, it means the nightly torch version is not compatible with some
|
||||
# of the dependencies. Please check the error message and add the package to whitelist
|
||||
# in /vllm/tools/generate_nightly_torch_test.py
|
||||
soft_fail: true
|
||||
source_file_dependencies:
|
||||
- requirements/nightly_torch_test.txt
|
||||
commands:
|
||||
- bash standalone_tests/pytorch_nightly_dependency.sh
|
||||
|
||||
- label: Async Engine, Inputs, Utils, Worker Test # 24min
|
||||
mirror_hardwares: [amdexperimental]
|
||||
source_file_dependencies:
|
||||
@ -89,7 +99,7 @@ steps:
|
||||
- VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
|
||||
|
||||
- label: Chunked Prefill Test
|
||||
mirror_hardwares: [amdexperimental]
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/basic_correctness/test_chunked_prefill
|
||||
@ -145,6 +155,7 @@ steps:
|
||||
- examples/offline_inference/rlhf_colocate.py
|
||||
- tests/examples/offline_inference/data_parallel.py
|
||||
- tests/v1/test_async_llm_dp.py
|
||||
- tests/v1/engine/test_engine_core_client.py
|
||||
commands:
|
||||
# test with tp=2 and external_dp=2
|
||||
- VLLM_USE_V1=0 torchrun --nproc-per-node=4 distributed/test_torchrun_example.py
|
||||
@ -154,6 +165,7 @@ steps:
|
||||
# test with internal dp
|
||||
- python3 ../examples/offline_inference/data_parallel.py
|
||||
- TP_SIZE=2 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
|
||||
- pytest -v -s v1/engine/test_engine_core_client.py::test_kv_cache_events_dp
|
||||
- pytest -v -s distributed/test_utils.py
|
||||
- pytest -v -s compile/test_basic_correctness.py
|
||||
- pytest -v -s distributed/test_pynccl.py
|
||||
@ -166,6 +178,23 @@ steps:
|
||||
- VLLM_ALLOW_INSECURE_SERIALIZATION=1 RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
|
||||
- popd
|
||||
|
||||
- label: EPLB Algorithm Test
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
source_file_dependencies:
|
||||
- vllm/distributed/eplb
|
||||
- tests/distributed/test_eplb_algo.py
|
||||
commands:
|
||||
- pytest -v -s distributed/test_eplb_algo.py
|
||||
|
||||
- label: EPLB Execution Test # 5min
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 4
|
||||
source_file_dependencies:
|
||||
- vllm/distributed/eplb
|
||||
- tests/distributed/test_eplb_execute.py
|
||||
commands:
|
||||
- pytest -v -s distributed/test_eplb_execute.py
|
||||
|
||||
- label: Metrics, Tracing Test # 10min
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
num_gpus: 2
|
||||
@ -175,6 +204,11 @@ steps:
|
||||
- tests/tracing
|
||||
commands:
|
||||
- pytest -v -s metrics
|
||||
- "pip install \
|
||||
'opentelemetry-sdk>=1.26.0' \
|
||||
'opentelemetry-api>=1.26.0' \
|
||||
'opentelemetry-exporter-otlp>=1.26.0' \
|
||||
'opentelemetry-semantic-conventions-ai>=0.4.1'"
|
||||
- pytest -v -s tracing
|
||||
|
||||
##### fast check tests #####
|
||||
@ -199,8 +233,9 @@ steps:
|
||||
- tests/test_sequence
|
||||
- tests/test_config
|
||||
- tests/test_logger
|
||||
- tests/test_vllm_port
|
||||
commands:
|
||||
- pytest -v -s engine test_sequence.py test_config.py test_logger.py
|
||||
- pytest -v -s engine test_sequence.py test_config.py test_logger.py test_vllm_port.py
|
||||
# OOM in the CI unless we run this separately
|
||||
- pytest -v -s tokenization
|
||||
|
||||
@ -263,6 +298,15 @@ steps:
|
||||
commands:
|
||||
- pytest -v -s prefix_caching
|
||||
|
||||
|
||||
- label: Platform Tests (CUDA)
|
||||
mirror_hardwares: [amdexperimental]
|
||||
source_file_dependencies:
|
||||
- vllm/
|
||||
- tests/cuda
|
||||
commands:
|
||||
- pytest -v -s cuda/test_cuda_context.py
|
||||
|
||||
- label: Samplers Test # 36min
|
||||
mirror_hardwares: [amdexperimental]
|
||||
source_file_dependencies:
|
||||
@ -274,17 +318,6 @@ steps:
|
||||
- pytest -v -s samplers
|
||||
- VLLM_USE_FLASHINFER_SAMPLER=1 pytest -v -s samplers
|
||||
|
||||
- label: LogitsProcessor Test # 5min
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
source_file_dependencies:
|
||||
- vllm/model_executor/layers
|
||||
- vllm/model_executor/guided_decoding
|
||||
- tests/test_logits_processor
|
||||
- tests/model_executor/test_guided_processors
|
||||
commands:
|
||||
- pytest -v -s test_logits_processor.py
|
||||
- pytest -v -s model_executor/test_guided_processors.py
|
||||
|
||||
- label: Speculative decoding tests # 40min
|
||||
mirror_hardwares: [amdexperimental]
|
||||
source_file_dependencies:
|
||||
@ -297,7 +330,7 @@ steps:
|
||||
- pytest -v -s spec_decode/e2e/test_eagle_correctness.py
|
||||
|
||||
- label: LoRA Test %N # 15min each
|
||||
mirror_hardwares: [amdexperimental]
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
source_file_dependencies:
|
||||
- vllm/lora
|
||||
- tests/lora
|
||||
@ -313,6 +346,7 @@ steps:
|
||||
commands:
|
||||
- pytest -v -s compile/test_pass_manager.py
|
||||
- pytest -v -s compile/test_fusion.py
|
||||
- pytest -v -s compile/test_fusion_attn.py
|
||||
- pytest -v -s compile/test_silu_mul_quant_fusion.py
|
||||
- pytest -v -s compile/test_sequence_parallelism.py
|
||||
- pytest -v -s compile/test_async_tp.py
|
||||
@ -328,6 +362,7 @@ steps:
|
||||
# these tests need to be separated, cannot combine
|
||||
- pytest -v -s compile/piecewise/test_simple.py
|
||||
- pytest -v -s compile/piecewise/test_toy_llama.py
|
||||
- pytest -v -s compile/piecewise/test_full_cudagraph.py
|
||||
|
||||
- label: PyTorch Fullgraph Test # 18min
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
@ -397,6 +432,17 @@ steps:
|
||||
- pytest -v -s tensorizer_loader
|
||||
- pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py
|
||||
|
||||
- label: Model Executor Test
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
soft_fail: true
|
||||
source_file_dependencies:
|
||||
- vllm/model_executor
|
||||
- tests/model_executor
|
||||
commands:
|
||||
- apt-get update && apt-get install -y curl libsodium23
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- pytest -v -s model_executor
|
||||
|
||||
- label: Benchmarks # 9min
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
working_dir: "/vllm-workspace/.buildkite"
|
||||
@ -420,6 +466,9 @@ steps:
|
||||
- vllm/model_executor/layers/quantization
|
||||
- tests/quantization
|
||||
commands:
|
||||
# temporary install here since we need nightly, will move to requirements/test.in
|
||||
# after torchao 0.12 release
|
||||
- pip install --pre torchao --index-url https://download.pytorch.org/whl/nightly/cu126
|
||||
- VLLM_TEST_FORCE_LOAD_FORMAT=auto pytest -v -s quantization
|
||||
|
||||
- label: LM Eval Small Models # 53min
|
||||
@ -593,13 +642,18 @@ steps:
|
||||
- vllm/executor/
|
||||
- vllm/model_executor/models/
|
||||
- tests/distributed/
|
||||
- tests/examples/offline_inference/data_parallel.py
|
||||
commands:
|
||||
- # the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
|
||||
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
|
||||
- NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
|
||||
- python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=0 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
|
||||
- VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py
|
||||
- VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py
|
||||
- # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
|
||||
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed'
|
||||
- NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed'
|
||||
- python3 ../examples/offline_inference/data_parallel.py --dp-size=2 --tp-size=1 --node-size=2 --node-rank=1 --master-addr=192.168.10.10 --master-port=12345 --enforce-eager --trust-remote-code
|
||||
|
||||
- label: Distributed Tests (2 GPUs) # 40min
|
||||
mirror_hardwares: [amdexperimental]
|
||||
@ -617,9 +671,11 @@ steps:
|
||||
- vllm/worker/model_runner.py
|
||||
- entrypoints/llm/test_collective_rpc.py
|
||||
- tests/v1/test_async_llm_dp.py
|
||||
- tests/v1/entrypoints/openai/test_multi_api_servers.py
|
||||
- vllm/v1/engine/
|
||||
commands:
|
||||
- TP_SIZE=1 DP_SIZE=2 pytest -v -s v1/test_async_llm_dp.py
|
||||
- DP_SIZE=2 pytest -v -s v1/entrypoints/openai/test_multi_api_servers.py
|
||||
- pytest -v -s entrypoints/llm/test_collective_rpc.py
|
||||
- pytest -v -s ./compile/test_basic_correctness.py
|
||||
- pytest -v -s ./compile/test_wrapper.py
|
||||
@ -660,7 +716,7 @@ steps:
|
||||
- pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins
|
||||
|
||||
- label: Multi-step Tests (4 GPUs) # 36min
|
||||
mirror_hardwares: [amdexperimental]
|
||||
mirror_hardwares: [amdexperimental, amdproduction]
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 4
|
||||
source_file_dependencies:
|
||||
@ -721,7 +777,7 @@ steps:
|
||||
- bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models.txt
|
||||
|
||||
- label: Weight Loading Multiple GPU Test - Large Models # optional
|
||||
mirror_hardwares: [amdexperimental]
|
||||
mirror_hardwares: [amdexperimental]
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
num_gpus: 2
|
||||
gpu: a100
|
||||
|
||||
.github/CODEOWNERS (20 changes, vendored)
@ -10,15 +10,21 @@
|
||||
/vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
|
||||
/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
|
||||
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth
|
||||
/vllm/model_executor/guided_decoding @mgoin @russellb
|
||||
/vllm/model_executor/guided_decoding @mgoin @russellb @aarnphm
|
||||
/vllm/multimodal @DarkLight1337 @ywang96
|
||||
/vllm/vllm_flash_attn @LucasWilkinson
|
||||
/vllm/lora @jeejeelee
|
||||
/vllm/reasoning @aarnphm
|
||||
/vllm/entrypoints @aarnphm
|
||||
CMakeLists.txt @tlrmchlsmth
|
||||
|
||||
# Any change to the VllmConfig changes can have a large user-facing impact,
|
||||
# so spam a lot of people
|
||||
/vllm/config.py @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor
|
||||
|
||||
# vLLM V1
|
||||
/vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
|
||||
/vllm/v1/structured_output @mgoin @russellb
|
||||
/vllm/v1/structured_output @mgoin @russellb @aarnphm
|
||||
|
||||
# Test ownership
|
||||
/.buildkite/lm-eval-harness @mgoin @simon-mo
|
||||
@ -27,8 +33,8 @@ CMakeLists.txt @tlrmchlsmth
|
||||
/tests/distributed/test_multi_node_assignment.py @youkaichao
|
||||
/tests/distributed/test_pipeline_parallel.py @youkaichao
|
||||
/tests/distributed/test_same_node.py @youkaichao
|
||||
/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo
|
||||
/tests/entrypoints/llm/test_guided_generate.py @mgoin @russellb
|
||||
/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo @aarnphm
|
||||
/tests/entrypoints/llm/test_guided_generate.py @mgoin @russellb @aarnphm
|
||||
/tests/kernels @tlrmchlsmth @WoosukKwon
|
||||
/tests/model_executor/test_guided_processors.py @mgoin @russellb
|
||||
/tests/models @DarkLight1337 @ywang96
|
||||
@ -38,11 +44,11 @@ CMakeLists.txt @tlrmchlsmth
|
||||
/tests/quantization @mgoin @robertgshaw2-redhat
|
||||
/tests/spec_decode @njhill @LiuXiaoxuanPKU
|
||||
/tests/test_inputs.py @DarkLight1337 @ywang96
|
||||
/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb
|
||||
/tests/v1/structured_output @mgoin @russellb
|
||||
/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
|
||||
/tests/v1/structured_output @mgoin @russellb @aarnphm
|
||||
/tests/weight_loading @mgoin @youkaichao
|
||||
/tests/lora @jeejeelee
|
||||
|
||||
# Docs
|
||||
/docs @hmellor
|
||||
mkdocs.yaml @hmellor
|
||||
mkdocs.yaml @hmellor
|
||||
|
||||
.github/ISSUE_TEMPLATE/400-bug-report.yml (10 changes, vendored)
@ -8,6 +8,16 @@ body:
|
||||
attributes:
|
||||
value: >
|
||||
#### Before submitting an issue, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/vllm-project/vllm/issues?q=is%3Aissue+sort%3Acreated-desc+).
|
||||
- type: markdown
|
||||
attributes:
|
||||
value: |
|
||||
⚠️ **SECURITY WARNING:** Please review any text you paste to ensure it does not contain sensitive information such as:
|
||||
- API tokens or keys (e.g., Hugging Face tokens, OpenAI API keys)
|
||||
- Passwords or authentication credentials
|
||||
- Private URLs or endpoints
|
||||
- Personal or confidential data
|
||||
|
||||
Consider redacting or replacing sensitive values with placeholders like `<YOUR_TOKEN_HERE>` when sharing configuration or code examples.
|
||||
- type: textarea
|
||||
attributes:
|
||||
label: Your current environment
|
||||
|
||||
.github/PULL_REQUEST_TEMPLATE.md (16 changes, vendored)
@ -1,6 +1,18 @@
|
||||
FILL IN THE PR DESCRIPTION HERE
|
||||
## Essential Elements of an Effective PR Description Checklist
|
||||
- [ ] The purpose of the PR, such as "Fix some issue (link existing issues this PR will resolve)".
|
||||
- [ ] The test plan, such as providing test command.
|
||||
- [ ] The test results, such as pasting the results comparison before and after, or e2e results
|
||||
- [ ] (Optional) The necessary documentation update, such as updating `supported_models.md` and `examples` for a new model.
|
||||
|
||||
FIX #xxxx (*link existing issues this PR will resolve*)
|
||||
PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS ABOVE HAVE BEEN CONSIDERED.
|
||||
|
||||
## Purpose
|
||||
|
||||
## Test Plan
|
||||
|
||||
## Test Result
|
||||
|
||||
## (Optional) Documentation Update
|
||||
|
||||
<!--- pyml disable-next-line no-emphasis-as-heading -->
|
||||
**BEFORE SUBMITTING, PLEASE READ <https://docs.vllm.ai/en/latest/contributing>** (anything written below this line will be removed by GitHub Actions)
|
||||
|
||||
.github/mergify.yml (62 changes, vendored)
@ -36,6 +36,21 @@ pull_request_rules:
|
||||
add:
|
||||
- frontend
|
||||
|
||||
- name: label-llama
|
||||
description: Automatically apply llama label
|
||||
conditions:
|
||||
- or:
|
||||
- files~=^examples/.*llama.*\.py
|
||||
- files~=^tests/.*llama.*\.py
|
||||
- files~=^vllm/entrypoints/openai/tool_parsers/llama.*\.py
|
||||
- files~=^vllm/model_executor/models/.*llama.*\.py
|
||||
- files~=^vllm/transformers_utils/configs/.*llama.*\.py
|
||||
- title~=(?i)llama
|
||||
actions:
|
||||
label:
|
||||
add:
|
||||
- llama
|
||||
|
||||
- name: label-multi-modality
|
||||
description: Automatically apply multi-modality label
|
||||
conditions:
|
||||
@ -51,6 +66,53 @@ pull_request_rules:
|
||||
add:
|
||||
- multi-modality
|
||||
|
||||
- name: label-performance
|
||||
description: Automatically apply performance label
|
||||
conditions:
|
||||
- or:
|
||||
- files~=^benchmarks/
|
||||
- files~=^vllm/benchmarks/
|
||||
- files~=^tests/benchmarks/
|
||||
- files~=^\.buildkite/nightly-benchmarks/
|
||||
actions:
|
||||
label:
|
||||
add:
|
||||
- performance
|
||||
|
||||
- name: label-qwen
|
||||
description: Automatically apply qwen label
|
||||
conditions:
|
||||
- or:
|
||||
- files~=^examples/.*qwen.*\.py
|
||||
- files~=^tests/.*qwen.*\.py
|
||||
- files~=^vllm/model_executor/models/.*qwen.*\.py
|
||||
- files~=^vllm/reasoning/.*qwen.*\.py
|
||||
- title~=(?i)Qwen
|
||||
actions:
|
||||
label:
|
||||
add:
|
||||
- qwen
|
||||
|
||||
- name: label-rocm
|
||||
description: Automatically apply rocm label
|
||||
conditions:
|
||||
- or:
|
||||
- files~=^csrc/rocm/
|
||||
- files~=^docker/Dockerfile.rocm
|
||||
- files~=^requirements/rocm.*\.txt
|
||||
- files~=^vllm/attention/backends/rocm.*\.py
|
||||
- files~=^vllm/attention/ops/rocm.*\.py
|
||||
- files~=^vllm/model_executor/layers/fused_moe/rocm.*\.py
|
||||
- files~=^vllm/v1/attention/backends/mla/rocm.*\.py
|
||||
- files~=^tests/kernels/.*_rocm.*\.py
|
||||
- files=vllm/platforms/rocm.py
|
||||
- title~=(?i)AMD
|
||||
- title~=(?i)ROCm
|
||||
actions:
|
||||
label:
|
||||
add:
|
||||
- rocm
|
||||
|
||||
- name: label-structured-output
|
||||
description: Automatically apply structured-output label
|
||||
conditions:
|
||||
|
||||
.gitignore (2 changes, vendored)
@ -200,5 +200,5 @@ benchmarks/**/*.json
|
||||
actionlint
|
||||
shellcheck*/
|
||||
|
||||
# Ingore moe/marlin_moe gen code
|
||||
# Ignore moe/marlin_moe gen code
|
||||
csrc/moe/marlin_moe_wna16/kernel_*
|
||||
|
||||
@ -11,6 +11,8 @@ repos:
|
||||
hooks:
|
||||
- id: yapf
|
||||
args: [--in-place, --verbose]
|
||||
# Keep the same list from yapfignore here to avoid yapf failing without any inputs
|
||||
exclude: '(.buildkite|benchmarks|build|examples)/.*'
|
||||
- repo: https://github.com/astral-sh/ruff-pre-commit
|
||||
rev: v0.11.7
|
||||
hooks:
|
||||
@ -18,12 +20,10 @@ repos:
|
||||
args: [--output-format, github, --fix]
|
||||
- id: ruff-format
|
||||
files: ^(.buildkite|benchmarks|examples)/.*
|
||||
- repo: https://github.com/codespell-project/codespell
|
||||
rev: v2.4.1
|
||||
- repo: https://github.com/crate-ci/typos
|
||||
rev: v1.32.0
|
||||
hooks:
|
||||
- id: codespell
|
||||
additional_dependencies: ['tomli']
|
||||
args: ['--toml', 'pyproject.toml']
|
||||
- id: typos
|
||||
- repo: https://github.com/PyCQA/isort
|
||||
rev: 6.0.1
|
||||
hooks:
|
||||
@ -53,12 +53,17 @@ repos:
|
||||
files: ^requirements/test\.(in|txt)$
|
||||
- repo: local
|
||||
hooks:
|
||||
- id: format-torch-nightly-test
|
||||
name: reformat nightly_torch_test.txt to be in sync with test.in
|
||||
language: python
|
||||
entry: python tools/generate_nightly_torch_test.py
|
||||
files: ^requirements/test\.(in|txt)$
|
||||
- id: mypy-local
|
||||
name: Run mypy for local Python installation
|
||||
entry: tools/mypy.sh 0 "local"
|
||||
language: python
|
||||
types: [python]
|
||||
additional_dependencies: &mypy_deps [mypy==1.11.1, types-cachetools, types-setuptools, types-PyYAML, types-requests]
|
||||
additional_dependencies: &mypy_deps [mypy==1.11.1, types-cachetools, types-setuptools, types-PyYAML, types-requests, pydantic]
|
||||
stages: [pre-commit] # Don't run in CI
|
||||
- id: mypy-3.9 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
|
||||
name: Run mypy for Python 3.9
|
||||
@ -115,6 +120,11 @@ repos:
|
||||
entry: python tools/check_spdx_header.py
|
||||
language: python
|
||||
types: [python]
|
||||
- id: check-root-lazy-imports
|
||||
name: Check root lazy imports
|
||||
entry: python tools/check_init_lazy_imports.py
|
||||
language: python
|
||||
types: [python]
|
||||
- id: check-filenames
|
||||
name: Check for spaces in all filenames
|
||||
entry: bash
|
||||
@ -143,6 +153,13 @@ repos:
|
||||
types: [python]
|
||||
pass_filenames: false
|
||||
additional_dependencies: [regex]
|
||||
- id: check-pickle-imports
|
||||
name: Prevent new pickle/cloudpickle imports
|
||||
entry: python tools/check_pickle_imports.py
|
||||
language: python
|
||||
types: [python]
|
||||
pass_filenames: false
|
||||
additional_dependencies: [pathspec, regex]
|
||||
# Keep `suggestion` last
|
||||
- id: suggestion
|
||||
name: Suggestion
|
||||
|
||||
@ -23,6 +23,9 @@ include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake)
|
||||
# Suppress potential warnings about unused manually-specified variables
|
||||
set(ignoreMe "${VLLM_PYTHON_PATH}")
|
||||
|
||||
# Prevent installation of dependencies (cutlass) by default.
|
||||
install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
|
||||
|
||||
#
|
||||
# Supported python versions. These versions will be searched in order, the
|
||||
# first match will be selected. These should be kept in sync with setup.py.
|
||||
@ -179,9 +182,6 @@ include(FetchContent)
|
||||
file(MAKE_DIRECTORY ${FETCHCONTENT_BASE_DIR}) # Ensure the directory exists
|
||||
message(STATUS "FetchContent base directory: ${FETCHCONTENT_BASE_DIR}")
|
||||
|
||||
#
|
||||
# Set rocm version dev int.
|
||||
#
|
||||
if(VLLM_GPU_LANG STREQUAL "HIP")
|
||||
#
|
||||
# Overriding the default -O set up by cmake, adding ggdb3 for the most verbose debug info
|
||||
@ -189,7 +189,6 @@ if(VLLM_GPU_LANG STREQUAL "HIP")
|
||||
set(CMAKE_${VLLM_GPU_LANG}_FLAGS_DEBUG "${CMAKE_${VLLM_GPU_LANG}_FLAGS_DEBUG} -O0 -ggdb3")
|
||||
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0 -ggdb3")
|
||||
|
||||
|
||||
#
|
||||
# Certain HIP functions are marked as [[nodiscard]], yet vllm ignores the result which generates
|
||||
# a lot of warnings that always mask real issues. Suppressing until this is properly addressed.
|
||||
@ -243,6 +242,7 @@ set(VLLM_EXT_SRC
|
||||
"csrc/activation_kernels.cu"
|
||||
"csrc/layernorm_kernels.cu"
|
||||
"csrc/layernorm_quant_kernels.cu"
|
||||
"csrc/sampler.cu"
|
||||
"csrc/cuda_view.cu"
|
||||
"csrc/quantization/gptq/q_gemm.cu"
|
||||
"csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
|
||||
@ -308,7 +308,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
# Keep building Marlin for 9.0 as there are some group sizes and shapes that
|
||||
# are not supported by Machete yet.
|
||||
# 9.0 for latest bf16 atomicAdd PTX
|
||||
cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;9.0+PTX" "${CUDA_ARCHS}")
|
||||
cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.7;9.0+PTX" "${CUDA_ARCHS}")
|
||||
if (MARLIN_ARCHS)
|
||||
|
||||
#
|
||||
@ -420,9 +420,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
# The cutlass_scaled_mm kernels for Blackwell (c3x, i.e. CUTLASS 3.x) require
|
||||
# CUDA 12.8 or later
|
||||
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;12.0a" "${CUDA_ARCHS}")
|
||||
# The cutlass_scaled_mm kernels for Blackwell SM100 (c3x, i.e. CUTLASS 3.x)
|
||||
# require CUDA 12.8 or later
|
||||
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a" "${CUDA_ARCHS}")
|
||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND SCALED_MM_ARCHS)
|
||||
set(SRCS
|
||||
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu"
|
||||
@ -454,7 +454,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
# kernels for the remaining archs that are not already built for 3x.
|
||||
# (Build 8.9 for FP8)
|
||||
cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS
|
||||
"7.5;8.0;8.9+PTX" "${CUDA_ARCHS}")
|
||||
"7.5;8.0;8.7;8.9+PTX" "${CUDA_ARCHS}")
|
||||
# subtract out the archs that are already built for 3x
|
||||
list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
|
||||
if (SCALED_MM_2X_ARCHS)
|
||||
@ -513,6 +513,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
CUDA_ARCHS "${FP4_ARCHS}")
|
||||
list(APPEND VLLM_EXT_SRC "${SRCS}")
|
||||
list(APPEND VLLM_GPU_FLAGS "-DENABLE_NVFP4=1")
|
||||
list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MOE_SM100=1")
|
||||
message(STATUS "Building NVFP4 for archs: ${FP4_ARCHS}")
|
||||
else()
|
||||
message(STATUS "Not building NVFP4 as no compatible archs were found.")
|
||||
@ -542,13 +543,12 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
|
||||
# CUTLASS MoE kernels
|
||||
|
||||
# The MoE kernel cutlass_moe_mm requires CUDA 12.3 or later (and only works
|
||||
# on Hopper). get_cutlass_moe_mm_data should only be compiled if it's possible
|
||||
# to compile MoE kernels that use its output.
|
||||
cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;10.0a" "${CUDA_ARCHS}")
|
||||
# The MoE kernel cutlass_moe_mm requires CUDA 12.3 or later (and ONLY works
|
||||
# on Hopper). get_cutlass_(pplx_)moe_mm_data should only be compiled
|
||||
# if it's possible to compile MoE kernels that use its output.
|
||||
cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a" "${CUDA_ARCHS}")
|
||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND SCALED_MM_ARCHS)
|
||||
set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cu"
|
||||
"csrc/quantization/cutlass_w8a8/moe/moe_data.cu")
|
||||
set(SRCS "csrc/quantization/cutlass_w8a8/moe/grouped_mm_c3x.cu")
|
||||
set_gencode_flags_for_srcs(
|
||||
SRCS "${SRCS}"
|
||||
CUDA_ARCHS "${SCALED_MM_ARCHS}")
|
||||
@ -566,6 +566,16 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
# moe_data.cu is used by all CUTLASS MoE kernels.
|
||||
cuda_archs_loose_intersection(CUTLASS_MOE_DATA_ARCHS "9.0a;10.0a" "${CUDA_ARCHS}")
|
||||
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.3 AND CUTLASS_MOE_DATA_ARCHS)
|
||||
set(SRCS "csrc/quantization/cutlass_w8a8/moe/moe_data.cu")
|
||||
set_gencode_flags_for_srcs(
|
||||
SRCS "${SRCS}"
|
||||
CUDA_ARCHS "${CUTLASS_MOE_DATA_ARCHS}")
|
||||
list(APPEND VLLM_EXT_SRC "${SRCS}")
|
||||
endif()
|
||||
|
||||
#
|
||||
# Machete kernels
|
||||
|
||||
@ -638,6 +648,14 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
# if CUDA endif
|
||||
endif()
|
||||
|
||||
if (VLLM_GPU_LANG STREQUAL "HIP")
|
||||
# Add QuickReduce kernels
|
||||
list(APPEND VLLM_EXT_SRC
|
||||
"csrc/custom_quickreduce.cu"
|
||||
)
|
||||
# if ROCM endif
|
||||
endif()
|
||||
|
||||
message(STATUS "Enabling C extension.")
|
||||
define_gpu_extension_target(
|
||||
_C
|
||||
@ -684,7 +702,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
|
||||
list(APPEND VLLM_MOE_EXT_SRC "${VLLM_MOE_WNA16_SRC}")
|
||||
# 9.0 for latest bf16 atomicAdd PTX
|
||||
cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;9.0+PTX" "${CUDA_ARCHS}")
|
||||
cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.7;9.0+PTX" "${CUDA_ARCHS}")
|
||||
if (MARLIN_MOE_ARCHS)
|
||||
|
||||
#
|
||||
@ -785,5 +803,7 @@ endif()
|
||||
# For CUDA we also build and ship some external projects.
|
||||
if (VLLM_GPU_LANG STREQUAL "CUDA")
|
||||
include(cmake/external_projects/flashmla.cmake)
|
||||
|
||||
# vllm-flash-attn should be last as it overwrites some CMake functions
|
||||
include(cmake/external_projects/vllm_flash_attn.cmake)
|
||||
endif ()
|
||||
|
||||
README.md (14 changes)
@ -58,8 +58,8 @@ vLLM is fast with:
|
||||
- Efficient management of attention key and value memory with [**PagedAttention**](https://blog.vllm.ai/2023/06/20/vllm.html)
|
||||
- Continuous batching of incoming requests
|
||||
- Fast model execution with CUDA/HIP graph
|
||||
- Quantizations: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), [AutoRound](https://arxiv.org/abs/2309.05516),INT4, INT8, and FP8.
|
||||
- Optimized CUDA kernels, including integration with FlashAttention and FlashInfer.
|
||||
- Quantizations: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), [AutoRound](https://arxiv.org/abs/2309.05516), INT4, INT8, and FP8
|
||||
- Optimized CUDA kernels, including integration with FlashAttention and FlashInfer
|
||||
- Speculative decoding
|
||||
- Chunked prefill
|
||||
|
||||
@ -72,14 +72,14 @@ vLLM is flexible and easy to use with:
|
||||
- Tensor parallelism and pipeline parallelism support for distributed inference
|
||||
- Streaming outputs
|
||||
- OpenAI-compatible API server
|
||||
- Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, TPU, and AWS Neuron.
|
||||
- Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, TPU, and AWS Neuron
|
||||
- Prefix caching support
|
||||
- Multi-LoRA support
|
||||
|
||||
vLLM seamlessly supports most popular open-source models on HuggingFace, including:
|
||||
- Transformer-like LLMs (e.g., Llama)
|
||||
- Mixture-of-Expert LLMs (e.g., Mixtral, Deepseek-V2 and V3)
|
||||
- Embedding Models (e.g. E5-Mistral)
|
||||
- Embedding Models (e.g., E5-Mistral)
|
||||
- Multi-modal LLMs (e.g., LLaVA)
|
||||
|
||||
Find the full list of supported models [here](https://docs.vllm.ai/en/latest/models/supported_models.html).
|
||||
@ -154,12 +154,14 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs
|
||||
|
||||
## Contact Us
|
||||
|
||||
<!-- --8<-- [start:contact-us] -->
|
||||
- For technical questions and feature requests, please use GitHub [Issues](https://github.com/vllm-project/vllm/issues) or [Discussions](https://github.com/vllm-project/vllm/discussions)
|
||||
- For discussing with fellow users, please use the [vLLM Forum](https://discuss.vllm.ai)
|
||||
- coordinating contributions and development, please use [Slack](https://slack.vllm.ai)
|
||||
- For coordinating contributions and development, please use [Slack](https://slack.vllm.ai)
|
||||
- For security disclosures, please use GitHub's [Security Advisories](https://github.com/vllm-project/vllm/security/advisories) feature
|
||||
- For collaborations and partnerships, please contact us at [vllm-questions@lists.berkeley.edu](mailto:vllm-questions@lists.berkeley.edu)
|
||||
<!-- --8<-- [end:contact-us] -->
|
||||
|
||||
## Media Kit
|
||||
|
||||
- If you wish to use vLLM's logo, please refer to [our media kit repo](https://github.com/vllm-project/media-kit).
|
||||
- If you wish to use vLLM's logo, please refer to [our media kit repo](https://github.com/vllm-project/media-kit)
|
||||
|
||||
@ -8,4 +8,6 @@ Please report security issues privately using [the vulnerability submission form
|
||||
|
||||
---
|
||||
|
||||
Please see the [Security Guide in the vLLM documentation](https://docs.vllm.ai/en/latest/usage/security.html) for more information on vLLM's security assumptions and recommendations.
|
||||
|
||||
Please see [PyTorch's Security Policy](https://github.com/pytorch/pytorch/blob/main/SECURITY.md) for more information and recommendations on how to securely interact with models.
|
||||
|
||||
@ -4,7 +4,7 @@ This README guides you through running benchmark tests with the extensive
|
||||
datasets supported on vLLM. It’s a living document, updated as new features and datasets
|
||||
become available.
|
||||
|
||||
## Dataset Overview
|
||||
**Dataset Overview**
|
||||
|
||||
<table style="width:100%; border-collapse: collapse;">
|
||||
<thead>
|
||||
@ -64,6 +64,12 @@ become available.
|
||||
<td style="text-align: center;">✅</td>
|
||||
<td><code>lmms-lab/LLaVA-OneVision-Data</code>, <code>Aeala/ShareGPT_Vicuna_unfiltered</code></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td><strong>Custom</strong></td>
|
||||
<td style="text-align: center;">✅</td>
|
||||
<td style="text-align: center;">✅</td>
|
||||
<td>Local file: <code>data.jsonl</code></td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
|
||||
@ -76,7 +82,10 @@ become available.
|
||||
**Note**: HuggingFace dataset's `dataset-name` should be set to `hf`
|
||||
|
||||
---
|
||||
## Example - Online Benchmark
|
||||
<details>
|
||||
<summary><b>🚀 Example - Online Benchmark</b></summary>
|
||||
|
||||
<br/>
|
||||
|
||||
First start serving your model
|
||||
|
||||
@ -124,7 +133,40 @@ P99 ITL (ms): 8.39
|
||||
==================================================
|
||||
```
|
||||
|
||||
### VisionArena Benchmark for Vision Language Models
|
||||
**Custom Dataset**
|
||||
|
||||
If the dataset you want to benchmark is not yet supported in vLLM, you can still benchmark it using `CustomDataset`. Your data needs to be in `.jsonl` format and must have a "prompt" field per entry, e.g., `data.jsonl`:
|
||||
|
||||
```
|
||||
{"prompt": "What is the capital of India?"}
|
||||
{"prompt": "What is the capital of Iran?"}
|
||||
{"prompt": "What is the capital of China?"}
|
||||
```
|
||||
|
||||
```bash
|
||||
# start server
|
||||
VLLM_USE_V1=1 vllm serve meta-llama/Llama-3.1-8B-Instruct --disable-log-requests
|
||||
```
|
||||
|
||||
```bash
|
||||
# run benchmarking script
|
||||
python3 benchmarks/benchmark_serving.py --port 9001 --save-result --save-detailed \
|
||||
--backend vllm \
|
||||
--model meta-llama/Llama-3.1-8B-Instruct \
|
||||
--endpoint /v1/completions \
|
||||
--dataset-name custom \
|
||||
--dataset-path <path-to-your-data-jsonl> \
|
||||
--custom-skip-chat-template \
|
||||
--num-prompts 80 \
|
||||
--max-concurrency 1 \
|
||||
--temperature=0.3 \
|
||||
--top-p=0.75 \
|
||||
--result-dir "./log/"
|
||||
```
|
||||
|
||||
You can skip applying chat template if your data already has it by using `--custom-skip-chat-template`.
|
||||
|
||||
**VisionArena Benchmark for Vision Language Models**
|
||||
|
||||
```bash
|
||||
# need a model with vision capability here
|
||||
@ -142,13 +184,13 @@ python3 vllm/benchmarks/benchmark_serving.py \
|
||||
--num-prompts 1000
|
||||
```
|
||||
|
||||
### InstructCoder Benchmark with Speculative Decoding
|
||||
**InstructCoder Benchmark with Speculative Decoding**
|
||||
|
||||
``` bash
|
||||
VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \
|
||||
--ngram_prompt_lookup_min 2 \
|
||||
--ngram-prompt-lookup-max 5 \
|
||||
--speculative_config '{"model": "[ngram]", "num_speculative_tokens": 5}
|
||||
--speculative-config $'{"method": "ngram",
|
||||
"num_speculative_tokens": 5, "prompt_lookup_max": 5,
|
||||
"prompt_lookup_min": 2}'
|
||||
```
|
||||
|
||||
``` bash
|
||||
@ -159,7 +201,7 @@ python3 benchmarks/benchmark_serving.py \
|
||||
--num-prompts 2048
|
||||
```
|
||||
|
||||
### Other HuggingFaceDataset Examples
|
||||
**Other HuggingFaceDataset Examples**
|
||||
|
||||
```bash
|
||||
vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests
|
||||
@ -203,7 +245,17 @@ python3 vllm/benchmarks/benchmark_serving.py \
|
||||
--seed 42
|
||||
```
|
||||
|
||||
### Running With Sampling Parameters
|
||||
**`philschmid/mt-bench`**
|
||||
|
||||
``` bash
|
||||
python3 vllm/benchmarks/benchmark_serving.py \
|
||||
--model Qwen/QwQ-32B \
|
||||
--dataset-name hf \
|
||||
--dataset-path philschmid/mt-bench \
|
||||
--num-prompts 80
|
||||
```
|
||||
|
||||
**Running With Sampling Parameters**
|
||||
|
||||
When using OpenAI-compatible backends such as `vllm`, optional sampling
|
||||
parameters can be specified. Example client command:
|
||||
@ -221,8 +273,27 @@ python3 vllm/benchmarks/benchmark_serving.py \
|
||||
--num-prompts 10
|
||||
```
|
||||
|
||||
---
|
||||
## Example - Offline Throughput Benchmark
|
||||
**Running With Ramp-Up Request Rate**

The benchmark tool also supports ramping up the request rate over the
duration of the benchmark run. This can be useful for stress testing the
server or finding the maximum throughput that it can handle, given some latency budget.

Two ramp-up strategies are supported:
- `linear`: Increases the request rate linearly from a start value to an end value.
- `exponential`: Increases the request rate exponentially.

The following arguments can be used to control the ramp-up (a sketch of a full invocation follows this list):
- `--ramp-up-strategy`: The ramp-up strategy to use (`linear` or `exponential`).
- `--ramp-up-start-rps`: The request rate at the beginning of the benchmark.
- `--ramp-up-end-rps`: The request rate at the end of the benchmark.
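A minimal sketch of such a run, assuming a server is already serving the model as in the online-benchmark example above; the model, dataset, and rate values here are illustrative, and only the `--ramp-up-*` flags are the point:

```bash
python3 vllm/benchmarks/benchmark_serving.py \
    --backend vllm \
    --model meta-llama/Llama-3.1-8B-Instruct \
    --dataset-name hf \
    --dataset-path philschmid/mt-bench \
    --num-prompts 1000 \
    --ramp-up-strategy linear \
    --ramp-up-start-rps 1 \
    --ramp-up-end-rps 20
```

With the `linear` strategy this sweeps the offered load from 1 to 20 requests per second over the run, which helps locate the rate at which a latency budget is first exceeded.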
|
||||
|
||||
</details>
|
||||
|
||||
<details>
|
||||
<summary><b>📈 Example - Offline Throughput Benchmark</b></summary>
|
||||
|
||||
<br/>
|
||||
|
||||
```bash
|
||||
python3 vllm/benchmarks/benchmark_throughput.py \
|
||||
@ -240,7 +311,7 @@ Total num prompt tokens: 5014
|
||||
Total num output tokens: 1500
|
||||
```
|
||||
|
||||
### VisionArena Benchmark for Vision Language Models
|
||||
**VisionArena Benchmark for Vision Language Models**
|
||||
|
||||
``` bash
|
||||
python3 vllm/benchmarks/benchmark_throughput.py \
|
||||
@ -260,7 +331,7 @@ Total num prompt tokens: 14527
|
||||
Total num output tokens: 1280
|
||||
```
|
||||
|
||||
### InstructCoder Benchmark with Speculative Decoding
|
||||
**InstructCoder Benchmark with Speculative Decoding**
|
||||
|
||||
``` bash
|
||||
VLLM_WORKER_MULTIPROC_METHOD=spawn \
|
||||
@ -273,9 +344,9 @@ python3 vllm/benchmarks/benchmark_throughput.py \
|
||||
--output-len=100 \
|
||||
--num-prompts=2048 \
|
||||
--async-engine \
|
||||
--ngram_prompt_lookup_min=2 \
|
||||
--ngram-prompt-lookup-max=5 \
|
||||
--speculative_config '{"model": "[ngram]", "num_speculative_tokens": 5}
|
||||
--speculative-config $'{"method": "ngram",
|
||||
"num_speculative_tokens": 5, "prompt_lookup_max": 5,
|
||||
"prompt_lookup_min": 2}'
|
||||
```
|
||||
|
||||
```
|
||||
@ -284,7 +355,7 @@ Total num prompt tokens: 261136
|
||||
Total num output tokens: 204800
|
||||
```
|
||||
|
||||
### Other HuggingFaceDataset Examples
|
||||
**Other HuggingFaceDataset Examples**
|
||||
|
||||
**`lmms-lab/LLaVA-OneVision-Data`**
|
||||
|
||||
@ -323,7 +394,7 @@ python3 benchmarks/benchmark_throughput.py \
|
||||
--num-prompts 10
|
||||
```
|
||||
|
||||
### Benchmark with LoRA Adapters
|
||||
**Benchmark with LoRA Adapters**
|
||||
|
||||
``` bash
|
||||
# download dataset
|
||||
@ -339,3 +410,196 @@ python3 vllm/benchmarks/benchmark_throughput.py \
|
||||
--enable-lora \
|
||||
--lora-path yard1/llama-2-7b-sql-lora-test
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
<details>
|
||||
<summary><b>🛠️ Example - Structured Output Benchmark</b></summary>
|
||||
|
||||
<br/>
|
||||
|
||||
Benchmark the performance of structured output generation (JSON, grammar, regex).
|
||||
|
||||
**Server Setup**
|
||||
|
||||
```bash
|
||||
vllm serve NousResearch/Hermes-3-Llama-3.1-8B --disable-log-requests
|
||||
```
|
||||
|
||||
**JSON Schema Benchmark**
|
||||
|
||||
```bash
|
||||
python3 benchmarks/benchmark_serving_structured_output.py \
|
||||
--backend vllm \
|
||||
--model NousResearch/Hermes-3-Llama-3.1-8B \
|
||||
--dataset json \
|
||||
--structured-output-ratio 1.0 \
|
||||
--request-rate 10 \
|
||||
--num-prompts 1000
|
||||
```
|
||||
|
||||
**Grammar-based Generation Benchmark**
|
||||
|
||||
```bash
|
||||
python3 benchmarks/benchmark_serving_structured_output.py \
|
||||
--backend vllm \
|
||||
--model NousResearch/Hermes-3-Llama-3.1-8B \
|
||||
--dataset grammar \
|
||||
--structure-type grammar \
|
||||
--request-rate 10 \
|
||||
--num-prompts 1000
|
||||
```
|
||||
|
||||
**Regex-based Generation Benchmark**
|
||||
|
||||
```bash
|
||||
python3 benchmarks/benchmark_serving_structured_output.py \
|
||||
--backend vllm \
|
||||
--model NousResearch/Hermes-3-Llama-3.1-8B \
|
||||
--dataset regex \
|
||||
--request-rate 10 \
|
||||
--num-prompts 1000
|
||||
```
|
||||
|
||||
**Choice-based Generation Benchmark**
|
||||
|
||||
```bash
|
||||
python3 benchmarks/benchmark_serving_structured_output.py \
|
||||
--backend vllm \
|
||||
--model NousResearch/Hermes-3-Llama-3.1-8B \
|
||||
--dataset choice \
|
||||
--request-rate 10 \
|
||||
--num-prompts 1000
|
||||
```
|
||||
|
||||
**XGrammar Benchmark Dataset**
|
||||
|
||||
```bash
|
||||
python3 benchmarks/benchmark_serving_structured_output.py \
|
||||
--backend vllm \
|
||||
--model NousResearch/Hermes-3-Llama-3.1-8B \
|
||||
--dataset xgrammar_bench \
|
||||
--request-rate 10 \
|
||||
--num-prompts 1000
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
<details>
|
||||
<summary><b>📚 Example - Long Document QA Benchmark</b></summary>
|
||||
|
||||
<br/>
|
||||
|
||||
Benchmark the performance of long document question-answering with prefix caching.
|
||||
|
||||
**Basic Long Document QA Test**
|
||||
|
||||
```bash
|
||||
python3 benchmarks/benchmark_long_document_qa_throughput.py \
|
||||
--model meta-llama/Llama-2-7b-chat-hf \
|
||||
--enable-prefix-caching \
|
||||
--num-documents 16 \
|
||||
--document-length 2000 \
|
||||
--output-len 50 \
|
||||
--repeat-count 5
|
||||
```
|
||||
|
||||
**Different Repeat Modes**
|
||||
|
||||
```bash
|
||||
# Random mode (default) - shuffle prompts randomly
|
||||
python3 benchmarks/benchmark_long_document_qa_throughput.py \
|
||||
--model meta-llama/Llama-2-7b-chat-hf \
|
||||
--enable-prefix-caching \
|
||||
--num-documents 8 \
|
||||
--document-length 3000 \
|
||||
--repeat-count 3 \
|
||||
--repeat-mode random
|
||||
|
||||
# Tile mode - repeat entire prompt list in sequence
|
||||
python3 benchmarks/benchmark_long_document_qa_throughput.py \
|
||||
--model meta-llama/Llama-2-7b-chat-hf \
|
||||
--enable-prefix-caching \
|
||||
--num-documents 8 \
|
||||
--document-length 3000 \
|
||||
--repeat-count 3 \
|
||||
--repeat-mode tile
|
||||
|
||||
# Interleave mode - repeat each prompt consecutively
|
||||
python3 benchmarks/benchmark_long_document_qa_throughput.py \
|
||||
--model meta-llama/Llama-2-7b-chat-hf \
|
||||
--enable-prefix-caching \
|
||||
--num-documents 8 \
|
||||
--document-length 3000 \
|
||||
--repeat-count 3 \
|
||||
--repeat-mode interleave
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
<details>
|
||||
<summary><b>🗂️ Example - Prefix Caching Benchmark</b></summary>
|
||||
|
||||
<br/>
|
||||
|
||||
Benchmark the efficiency of automatic prefix caching.
|
||||
|
||||
**Fixed Prompt with Prefix Caching**
|
||||
|
||||
```bash
|
||||
python3 benchmarks/benchmark_prefix_caching.py \
|
||||
--model meta-llama/Llama-2-7b-chat-hf \
|
||||
--enable-prefix-caching \
|
||||
--num-prompts 1 \
|
||||
--repeat-count 100 \
|
||||
--input-length-range 128:256
|
||||
```
|
||||
|
||||
**ShareGPT Dataset with Prefix Caching**
|
||||
|
||||
```bash
|
||||
# download dataset
|
||||
# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
|
||||
|
||||
python3 benchmarks/benchmark_prefix_caching.py \
|
||||
--model meta-llama/Llama-2-7b-chat-hf \
|
||||
--dataset-path /path/ShareGPT_V3_unfiltered_cleaned_split.json \
|
||||
--enable-prefix-caching \
|
||||
--num-prompts 20 \
|
||||
--repeat-count 5 \
|
||||
--input-length-range 128:256
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
<details>
|
||||
<summary><b>⚡ Example - Request Prioritization Benchmark</b></summary>
|
||||
|
||||
<br/>
|
||||
|
||||
Benchmark the performance of request prioritization in vLLM.
|
||||
|
||||
**Basic Prioritization Test**
|
||||
|
||||
```bash
|
||||
python3 benchmarks/benchmark_prioritization.py \
|
||||
--model meta-llama/Llama-2-7b-chat-hf \
|
||||
--input-len 128 \
|
||||
--output-len 64 \
|
||||
--num-prompts 100 \
|
||||
--scheduling-policy priority
|
||||
```
|
||||
|
||||
**Multiple Sequences per Prompt**
|
||||
|
||||
```bash
|
||||
python3 benchmarks/benchmark_prioritization.py \
|
||||
--model meta-llama/Llama-2-7b-chat-hf \
|
||||
--input-len 128 \
|
||||
--output-len 64 \
|
||||
--num-prompts 100 \
|
||||
--scheduling-policy priority \
|
||||
--n 2
|
||||
```
|
||||
|
||||
</details>
|
||||
|
||||
@ -10,11 +10,16 @@
|
||||
# 3. Set variables (ALL REQUIRED)
|
||||
# BASE: your directory for vllm repo
|
||||
# MODEL: the model served by vllm
|
||||
# SYSTEM: the hardware, either TPU or GPU; for other systems, "get best profile" might not be supported.
|
||||
# TP: ways of tensor parallelism
|
||||
# DOWNLOAD_DIR: directory to download and load model weights.
|
||||
# INPUT_LEN: request input len
|
||||
# OUTPUT_LEN: request output len
|
||||
# MIN_CACHE_HIT_PCT: prefix cache rate
|
||||
# MAX_LATENCY_ALLOWED_MS: (e2e) latency requirement. If there's no latency requirement, set it to a large number like 1000000000
|
||||
# NUM_SEQS_LIST: a list of `max-num-seqs` you want to loop with.
|
||||
# NUM_BATCHED_TOKENS_LIST: a list of `max-num-batched-tokens` you want to loop with.
|
||||
# Note that the default NUM_SEQS_LIST and NUM_BATCHED_TOKENS_LIST are set for medium-size input/output lengths; for extra short contexts (such as 20:20), you might need to include larger numbers in NUM_SEQS_LIST.
|
||||
# 4. Run the script. It might take a long time; you can use tmux so the script keeps running if the connection drops.
|
||||
# 5. The final result will be saved in RESULT file.
|
||||
|
||||
@ -30,31 +35,31 @@
|
||||
TAG=$(date +"%Y_%m_%d_%H_%M")
|
||||
BASE=""
|
||||
MODEL="meta-llama/Llama-3.1-8B-Instruct"
|
||||
SYSTEM="TPU"
|
||||
TP=1
|
||||
DOWNLOAD_DIR=""
|
||||
INPUT_LEN=4000
|
||||
OUTPUT_LEN=16
|
||||
MIN_CACHE_HIT_PCT_PCT=0
|
||||
MIN_CACHE_HIT_PCT=0
|
||||
MAX_LATENCY_ALLOWED_MS=100000000000
|
||||
NUM_SEQS_LIST="128 256"
|
||||
NUM_BATCHED_TOKENS_LIST="512 1024 2048 4096"
|
||||
|
||||
LOG_FOLDER="$BASE/auto-benchmark/$TAG"
|
||||
RESULT="$LOG_FOLDER/result.txt"
|
||||
PROFILE_PATH="$LOG_FOLDER/profile"
|
||||
|
||||
echo "result file$ $RESULT"
|
||||
echo "result file: $RESULT"
|
||||
echo "model: $MODEL"
|
||||
echo
|
||||
|
||||
rm -rf $LOG_FOLDER
|
||||
rm -rf $PROFILE_PATH
|
||||
mkdir -p $LOG_FOLDER
|
||||
mkdir -p $PROFILE_PATH
|
||||
|
||||
cd "$BASE/vllm"
|
||||
# create sonnet-4x.txt so that we can sample 2048 tokens for input
|
||||
echo "" > benchmarks/sonnet_4x.txt
|
||||
for _ in {1..4}
|
||||
do
|
||||
cat benchmarks/sonnet.txt >> benchmarks/sonnet_4x.txt
|
||||
done
|
||||
|
||||
pip install datasets
|
||||
pip install -q datasets
|
||||
|
||||
current_hash=$(git rev-parse HEAD)
|
||||
echo "hash:$current_hash" >> "$RESULT"
|
||||
@ -64,53 +69,88 @@ best_throughput=0
|
||||
best_max_num_seqs=0
|
||||
best_num_batched_tokens=0
|
||||
best_goodput=0
|
||||
run_benchmark() {
|
||||
local max_num_seqs=$1
|
||||
local max_num_batched_tokens=$2
|
||||
echo "max_num_seq: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens"
|
||||
local vllm_log="$LOG_FOLDER/vllm_log_${max_num_seqs}_${max_num_batched_tokens}.txt"
|
||||
echo "vllm_log: $vllm_log"
|
||||
echo
|
||||
rm -f $vllm_log
|
||||
|
||||
# start the server
|
||||
VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 vllm serve $MODEL \
|
||||
start_server() {
|
||||
local gpu_memory_utilization=$1
|
||||
local max_num_seqs=$2
|
||||
local max_num_batched_tokens=$3
|
||||
local vllm_log=$4
|
||||
local profile_dir=$5
|
||||
|
||||
pkill -f vllm
|
||||
|
||||
VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir vllm serve $MODEL \
|
||||
--disable-log-requests \
|
||||
--port 8004 \
|
||||
--gpu-memory-utilization 0.98 \
|
||||
--gpu-memory-utilization $gpu_memory_utilization \
|
||||
--max-num-seqs $max_num_seqs \
|
||||
--max-num-batched-tokens $max_num_batched_tokens \
|
||||
--tensor-parallel-size 1 \
|
||||
--tensor-parallel-size $TP \
|
||||
--enable-prefix-caching \
|
||||
--load-format dummy \
|
||||
--download-dir $DOWNLOAD_DIR \
|
||||
--download-dir "$DOWNLOAD_DIR" \
|
||||
--max-model-len $(( INPUT_LEN+OUTPUT_LEN )) > "$vllm_log" 2>&1 &
|
||||
echo "wait for 10 minutes.."
|
||||
echo
|
||||
|
||||
# wait for 10 minutes...
|
||||
server_started=0
|
||||
for i in {1..60}; do
|
||||
if grep -Fq "Application startup complete" "$vllm_log"; then
|
||||
echo "Application started"
|
||||
for i in {1..60}; do
|
||||
RESPONSE=$(curl -s -X GET "http://0.0.0.0:8004/health" -w "%{http_code}" -o /dev/stdout)
|
||||
STATUS_CODE=$(echo "$RESPONSE" | tail -n 1)
|
||||
if [[ "$STATUS_CODE" -eq 200 ]]; then
|
||||
server_started=1
|
||||
break
|
||||
else
|
||||
# echo "wait for 10 seconds..."
|
||||
sleep 10
|
||||
fi
|
||||
done
|
||||
|
||||
if (( ! server_started )); then
|
||||
echo "server did not start within 10 minutes, terminate the benchmarking. Please check server log at $vllm_log"
|
||||
echo "pkill -f vllm"
|
||||
echo
|
||||
pkill vllm
|
||||
sleep 10
|
||||
echo "server did not start within 10 minutes. Please check server log at $vllm_log".
|
||||
return 1
|
||||
else
|
||||
return 0
|
||||
fi
|
||||
}
|
||||
|
||||
update_best_profile() {
|
||||
local profile_dir=$1
|
||||
local profile_index=$2
|
||||
sorted_paths=($(find "$profile_dir" -maxdepth 1 -not -path "$profile_dir" | sort))
|
||||
selected_profile_file=
|
||||
if [[ "$SYSTEM" == "TPU" ]]; then
|
||||
selected_profile_file="${sorted_paths[$profile_index]}/*.xplane.pb"
|
||||
fi
|
||||
if [[ "$SYSTEM" == "GPU" ]]; then
|
||||
selected_profile_file="${sorted_paths[$profile_index]}"
|
||||
fi
|
||||
rm -f $PROFILE_PATH/*
|
||||
cp $selected_profile_file $PROFILE_PATH
|
||||
}
|
||||
|
||||
run_benchmark() {
|
||||
local max_num_seqs=$1
|
||||
local max_num_batched_tokens=$2
|
||||
local gpu_memory_utilization=$3
|
||||
echo "max_num_seq: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens"
|
||||
local vllm_log="$LOG_FOLDER/vllm_log_${max_num_seqs}_${max_num_batched_tokens}.txt"
|
||||
local profile_dir="$LOG_FOLDER/profile_${max_num_seqs}_${max_num_batched_tokens}"
|
||||
echo "vllm_log: $vllm_log"
|
||||
echo
|
||||
rm -f $vllm_log
|
||||
mkdir -p $profile_dir
|
||||
pkill -f vllm
|
||||
local profile_index=0
|
||||
|
||||
echo "starting server..."
|
||||
start_server $gpu_memory_utilization $max_num_seqs $max_num_batched_tokens $vllm_log $profile_dir
|
||||
result=$?
|
||||
if [[ "$result" -eq 1 ]]; then
|
||||
echo "server failed to start. gpu_memory_utilization:$gpu_memory_utilization, max_num_seqs:$max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens"
|
||||
else
|
||||
echo "server started."
|
||||
fi
|
||||
echo
|
||||
|
||||
echo "run benchmark test..."
|
||||
echo
|
||||
meet_latency_requirement=0
|
||||
# get a basic qps by using request-rate inf
|
||||
bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_inf.txt"
|
||||
@ -118,30 +158,32 @@ run_benchmark() {
|
||||
python benchmarks/benchmark_serving.py \
|
||||
--backend vllm \
|
||||
--model $MODEL \
|
||||
--dataset-name sonnet \
|
||||
--dataset-path benchmarks/sonnet_4x.txt \
|
||||
--sonnet-input-len $INPUT_LEN \
|
||||
--sonnet-output-len $OUTPUT_LEN \
|
||||
--dataset-name random \
|
||||
--random-input-len $INPUT_LEN \
|
||||
--random-output-len $OUTPUT_LEN \
|
||||
--ignore-eos \
|
||||
--disable-tqdm \
|
||||
--request-rate inf \
|
||||
--percentile-metrics ttft,tpot,itl,e2el \
|
||||
--goodput e2el:$MAX_LATENCY_ALLOWED_MS \
|
||||
--num-prompts 100 \
|
||||
--sonnet-prefix-len $prefix_len \
|
||||
--port 8004 > "$bm_log"
|
||||
through_put=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
|
||||
--num-prompts 1000 \
|
||||
--random-prefix-len $prefix_len \
|
||||
--port 8004 \
|
||||
--profile &> "$bm_log"
|
||||
throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
|
||||
e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
|
||||
goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
|
||||
|
||||
if (( $(echo "$e2el <= $MAX_LATENCY_ALLOWED_MS" | bc -l) )); then
|
||||
meet_latency_requirement=1
|
||||
request_rate=inf
|
||||
fi
|
||||
|
||||
if (( ! meet_latency_requirement )); then
|
||||
# start from request-rate as int(through_put) + 1
|
||||
request_rate=$((${through_put%.*} + 1))
|
||||
# start from request-rate as int(throughput) + 1
|
||||
request_rate=$((${throughput%.*} + 1))
|
||||
while ((request_rate > 0)); do
|
||||
profile_index=$((profile_index+1))
|
||||
# clear prefix cache
|
||||
curl -X POST http://0.0.0.0:8004/reset_prefix_cache
|
||||
sleep 5
|
||||
@ -149,19 +191,18 @@ run_benchmark() {
|
||||
python benchmarks/benchmark_serving.py \
|
||||
--backend vllm \
|
||||
--model $MODEL \
|
||||
--dataset-name sonnet \
|
||||
--dataset-path benchmarks/sonnet_4x.txt \
|
||||
--sonnet-input-len $INPUT_LEN \
|
||||
--sonnet-output-len $OUTPUT_LEN \
|
||||
--ignore_eos \
|
||||
--dataset-name random \
|
||||
--random-input-len $INPUT_LEN \
|
||||
--random-output-len $OUTPUT_LEN \
|
||||
--ignore-eos \
|
||||
--disable-tqdm \
|
||||
--request-rate $request_rate \
|
||||
--percentile-metrics ttft,tpot,itl,e2el \
|
||||
--goodput e2el:$MAX_LATENCY_ALLOWED_MS \
|
||||
--num-prompts 100 \
|
||||
--sonnet-prefix-len $prefix_len \
|
||||
--port 8004 > "$bm_log"
|
||||
through_put=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
|
||||
--random-prefix-len $prefix_len \
|
||||
--port 8004 &> "$bm_log"
|
||||
throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
|
||||
e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
|
||||
goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
|
||||
if (( $(echo "$e2el <= $MAX_LATENCY_ALLOWED_MS" | bc -l) )); then
|
||||
@ -173,13 +214,19 @@ run_benchmark() {
|
||||
fi
|
||||
# write the results and update the best result.
|
||||
if ((meet_latency_requirement)); then
|
||||
echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, through put: $through_put, goodput: $goodput"
|
||||
echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, through put: $through_put, goodput: $goodput" >> "$RESULT"
|
||||
if (( $(echo "$through_put > $best_throughput" | bc -l) )); then
|
||||
best_throughput=$through_put
|
||||
echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, throughput: $throughput, goodput: $goodput"
|
||||
echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens, request_rate: $request_rate, e2el: $e2el, throughput: $throughput, goodput: $goodput" >> "$RESULT"
|
||||
if (( $(echo "$throughput > $best_throughput" | bc -l) )); then
|
||||
best_throughput=$throughput
|
||||
best_max_num_seqs=$max_num_seqs
|
||||
best_num_batched_tokens=$max_num_batched_tokens
|
||||
best_goodput=$goodput
|
||||
if [[ "$SYSTEM" == "TPU" ]]; then
|
||||
update_best_profile "$profile_dir/plugins/profile" $profile_index
|
||||
fi
|
||||
if [[ "$SYSTEM" == "GPU" ]]; then
|
||||
update_best_profile "$profile_dir" $profile_index
|
||||
fi
|
||||
fi
|
||||
else
|
||||
echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens does not meet latency requirement ${MAX_LATENCY_ALLOWED_MS}"
|
||||
@ -188,25 +235,42 @@ run_benchmark() {
|
||||
|
||||
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput"
|
||||
|
||||
echo "pkill -f vllm"
|
||||
echo
|
||||
pkill vllm
|
||||
sleep 10
|
||||
rm -f $vllm_log
|
||||
printf '=%.0s' $(seq 1 20)
|
||||
return 0
|
||||
}
|
||||
|
||||
read -r -a num_seqs_list <<< "$NUM_SEQS_LIST"
|
||||
read -r -a num_batched_tokens_list <<< "$NUM_BATCHED_TOKENS_LIST"
|
||||
|
||||
num_seqs_list="128 256"
|
||||
num_batched_tokens_list="512 1024 2048 4096"
|
||||
for num_seqs in $num_seqs_list; do
|
||||
for num_batched_tokens in $num_batched_tokens_list; do
|
||||
run_benchmark $num_seqs $num_batched_tokens
|
||||
exit 0
|
||||
# first find out the max gpu-memory-utilization without HBM OOM.
|
||||
gpu_memory_utilization=0.98
|
||||
find_gpu_memory_utilization=0
|
||||
while (( $(echo "$gpu_memory_utilization >= 0.9" | bc -l) )); do
|
||||
start_server $gpu_memory_utilization "${num_seqs_list[-1]}" "${num_batched_tokens_list[-1]}" "$LOG_FOLDER/vllm_log_gpu_memory_utilization_$gpu_memory_utilization.log"
|
||||
result=$?
|
||||
if [[ "$result" -eq 0 ]]; then
|
||||
find_gpu_memory_utilization=1
|
||||
break
|
||||
else
|
||||
gpu_memory_utilization=$(echo "$gpu_memory_utilization - 0.01" | bc)
|
||||
fi
|
||||
done
|
||||
|
||||
if [[ "$find_gpu_memory_utilization" -eq 1 ]]; then
|
||||
echo "Using gpu_memory_utilization=$gpu_memory_utilization to serve model."
|
||||
else
|
||||
echo "Cannot find a proper gpu_memory_utilization over 0.9 to serve the model, please check logs in $LOG_FOLDER."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
for num_seqs in "${num_seqs_list[@]}"; do
|
||||
for num_batched_tokens in "${num_batched_tokens_list[@]}"; do
|
||||
run_benchmark $num_seqs $num_batched_tokens $gpu_memory_utilization
|
||||
done
|
||||
done
|
||||
echo "finish permutations"
|
||||
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput"
|
||||
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput" >> "$RESULT"
|
||||
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH"
|
||||
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH" >> "$RESULT"
|
||||
|
||||
|
||||
@ -1,4 +1,5 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import io
|
||||
import json
|
||||
@ -324,7 +325,7 @@ async def async_request_openai_completions(
|
||||
|
||||
most_recent_timestamp = timestamp
|
||||
generated_text += text or ""
|
||||
elif usage := data.get("usage"):
|
||||
if usage := data.get("usage"):
|
||||
output.output_tokens = usage.get("completion_tokens")
|
||||
if first_chunk_received:
|
||||
output.success = True
|
||||
@ -403,8 +404,14 @@ async def async_request_openai_chat_completions(
|
||||
chunk_bytes = chunk_bytes.strip()
|
||||
if not chunk_bytes:
|
||||
continue
|
||||
chunk_bytes = chunk_bytes.decode("utf-8")
|
||||
# NOTE: SSE comments (often used as pings) start with a colon.
|
||||
# These are not JSON data payload and should be skipped.
|
||||
if chunk_bytes.startswith(":"):
|
||||
continue
|
||||
|
||||
chunk = chunk_bytes.removeprefix("data: ")
|
||||
|
||||
chunk = chunk_bytes.decode("utf-8").removeprefix("data: ")
|
||||
if chunk != "[DONE]":
|
||||
timestamp = time.perf_counter()
|
||||
data = json.loads(chunk)
|
||||
@ -611,6 +618,7 @@ ASYNC_REQUEST_FUNCS = {
|
||||
"tensorrt-llm": async_request_trt_llm,
|
||||
"scalellm": async_request_openai_completions,
|
||||
"sglang": async_request_openai_completions,
|
||||
"llama.cpp": async_request_openai_completions,
|
||||
}
|
||||
|
||||
OPENAI_COMPATIBLE_BACKENDS = [
|
||||
|
||||
@ -1,4 +1,5 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
This module defines a framework for sampling benchmark requests from various
|
||||
datasets. Each dataset subclass of BenchmarkDataset must implement sample
|
||||
@ -9,9 +10,6 @@ generation. Supported dataset types include:
|
||||
- BurstGPT
|
||||
- HuggingFace
|
||||
- VisionArena
|
||||
|
||||
TODO: Implement CustomDataset to parse a JSON file and convert its contents into
|
||||
SampleRequest instances, similar to the approach used in ShareGPT.
|
||||
"""
|
||||
|
||||
import base64
|
||||
@ -351,11 +349,12 @@ class RandomDataset(BenchmarkDataset):
|
||||
# [1650, 939, 486] -> ['Ġcall', 'sh', 'ere']
|
||||
# To avoid uncontrolled change of the prompt length,
|
||||
# the encoded sequence is truncated before being decode again.
|
||||
total_input_len = prefix_len + int(input_lens[i])
|
||||
re_encoded_sequence = tokenizer.encode(prompt, add_special_tokens=False)[
|
||||
: input_lens[i]
|
||||
:total_input_len
|
||||
]
|
||||
prompt = tokenizer.decode(re_encoded_sequence)
|
||||
total_input_len = prefix_len + int(input_lens[i])
|
||||
total_input_len = len(re_encoded_sequence)
|
||||
requests.append(
|
||||
SampleRequest(
|
||||
prompt=prompt,
|
||||
@ -442,6 +441,97 @@ class ShareGPTDataset(BenchmarkDataset):
|
||||
return samples
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Custom Dataset Implementation
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
|
||||
class CustomDataset(BenchmarkDataset):
|
||||
"""
|
||||
Implements the Custom dataset. Loads data from a JSONL file and generates
|
||||
sample requests based on conversation turns. E.g.,
|
||||
```
|
||||
{"prompt": "What is the capital of India?"}
|
||||
{"prompt": "What is the capital of Iran?"}
|
||||
{"prompt": "What is the capital of China?"}
|
||||
```
|
||||
"""
|
||||
|
||||
def __init__(self, **kwargs) -> None:
|
||||
super().__init__(**kwargs)
|
||||
self.load_data()
|
||||
|
||||
def load_data(self) -> None:
|
||||
if self.dataset_path is None:
|
||||
raise ValueError("dataset_path must be provided for loading data.")
|
||||
|
||||
# self.data will be a list of dictionaries
|
||||
# e.g., [{"prompt": "What is the capital of India?"}, ...]
|
||||
# This will be the standardized format which load_data()
|
||||
# has to convert into depending on the filetype of dataset_path.
|
||||
# sample() will assume this standardized format of self.data
|
||||
self.data = []
|
||||
|
||||
# Load the JSONL file
|
||||
if self.dataset_path.endswith(".jsonl"):
|
||||
jsonl_data = pd.read_json(path_or_buf=self.dataset_path, lines=True)
|
||||
|
||||
# check if the JSONL file has a 'prompt' column
|
||||
if "prompt" not in jsonl_data.columns:
|
||||
raise ValueError("JSONL file must contain a 'prompt' column.")
|
||||
|
||||
# Convert each row to a dictionary and append to self.data
|
||||
# This will convert the DataFrame to a list of dictionaries
|
||||
# where each dictionary corresponds to a row in the DataFrame.
|
||||
# This is the standardized format we want for self.data
|
||||
for _, row in jsonl_data.iterrows():
|
||||
self.data.append(row.to_dict())
|
||||
else:
|
||||
raise NotImplementedError(
|
||||
"Only JSONL format is supported for CustomDataset."
|
||||
)
|
||||
|
||||
random.seed(self.random_seed)
|
||||
random.shuffle(self.data)
|
||||
|
||||
def sample(
|
||||
self,
|
||||
tokenizer: PreTrainedTokenizerBase,
|
||||
num_requests: int,
|
||||
lora_path: Optional[str] = None,
|
||||
max_loras: Optional[int] = None,
|
||||
output_len: Optional[int] = None,
|
||||
enable_multimodal_chat: bool = False,
|
||||
skip_chat_template: bool = False,
|
||||
**kwargs,
|
||||
) -> list:
|
||||
sampled_requests = []
|
||||
for item in self.data:
|
||||
if len(sampled_requests) >= num_requests:
|
||||
break
|
||||
prompt = item["prompt"]
|
||||
|
||||
# apply template
|
||||
if not skip_chat_template:
|
||||
prompt = tokenizer.apply_chat_template(
|
||||
[{"role": "user", "content": prompt}],
|
||||
add_generation_prompt=True,
|
||||
tokenize=False,
|
||||
)
|
||||
|
||||
prompt_len = len(tokenizer(prompt).input_ids)
|
||||
sampled_requests.append(
|
||||
SampleRequest(
|
||||
prompt=prompt,
|
||||
prompt_len=prompt_len,
|
||||
expected_output_len=output_len,
|
||||
)
|
||||
)
|
||||
self.maybe_oversample_requests(sampled_requests, num_requests)
|
||||
|
||||
return sampled_requests
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Sonnet Dataset Implementation
|
||||
# -----------------------------------------------------------------------------
|
||||
@@ -776,7 +866,15 @@ class InstructCoderDataset(HuggingFaceDataset):
        for item in self.data:
            if len(sampled_requests) >= num_requests:
                break
            prompt = f"{item['instruction']}:\n{item['input']}"
            prompt = f"{item['input']}\n\n{item['instruction']} Just output \
                the code, do not include any explanation."

            # apply template
            prompt = tokenizer.apply_chat_template(
                [{"role": "user", "content": prompt}],
                add_generation_prompt=True,
                tokenize=False,
            )
            prompt_len = len(tokenizer(prompt).input_ids)
            sampled_requests.append(
                SampleRequest(

@@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Benchmark the latency of processing a single batch of requests."""

import argparse
@@ -6,13 +7,12 @@ import dataclasses
import json
import os
import time
from pathlib import Path
from typing import Any, Optional

import numpy as np
import torch
from tqdm import tqdm

import vllm.envs as envs
from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
from vllm import LLM, SamplingParams
from vllm.engine.arg_utils import EngineArgs
@@ -80,17 +80,9 @@ def main(args: argparse.Namespace):

    def run_to_completion(profile_dir: Optional[str] = None):
        if profile_dir:
            with torch.profiler.profile(
                activities=[
                    torch.profiler.ProfilerActivity.CPU,
                    torch.profiler.ProfilerActivity.CUDA,
                ],
                on_trace_ready=torch.profiler.tensorboard_trace_handler(
                    str(profile_dir)
                ),
            ) as p:
                llm_generate()
            print(p.key_averages().table(sort_by="self_cuda_time_total"))
            llm.start_profile()
            llm_generate()
            llm.stop_profile()
        else:
            start_time = time.perf_counter()
            llm_generate()
@@ -103,11 +95,7 @@ def main(args: argparse.Namespace):
    run_to_completion(profile_dir=None)

    if args.profile:
        profile_dir = args.profile_result_dir
        if not profile_dir:
            profile_dir = (
                Path(".") / "vllm_benchmark_result" / f"latency_result_{time.time()}"
            )
        profile_dir = envs.VLLM_TORCH_PROFILER_DIR
        print(f"Profiling (results will be saved to '{profile_dir}')...")
        run_to_completion(profile_dir=profile_dir)
        return
@@ -135,7 +123,7 @@ def main(args: argparse.Namespace):
        save_to_pytorch_benchmark_format(args, results)


if __name__ == "__main__":
def create_argument_parser():
    parser = FlexibleArgumentParser(
        description="Benchmark the latency of processing a single batch of "
        "requests till completion."
@@ -164,15 +152,6 @@ if __name__ == "__main__":
        action="store_true",
        help="profile the generation process of a single batch",
    )
    parser.add_argument(
        "--profile-result-dir",
        type=str,
        default=None,
        help=(
            "path to save the pytorch profiler output. Can be visualized "
            "with ui.perfetto.dev or Tensorboard."
        ),
    )
    parser.add_argument(
        "--output-json",
        type=str,
@@ -192,5 +171,16 @@ if __name__ == "__main__":
    # V1 enables prefix caching by default which skews the latency
    # numbers. We need to disable prefix caching by default.
    parser.set_defaults(enable_prefix_caching=False)

    return parser


if __name__ == "__main__":
    parser = create_argument_parser()
    args = parser.parse_args()
    if args.profile and not envs.VLLM_TORCH_PROFILER_DIR:
        raise OSError(
            "The environment variable 'VLLM_TORCH_PROFILER_DIR' is not set. "
            "Please set it to a valid path to use torch profiler."
        )
    main(args)
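For orientation, a minimal sketch of the timing pattern this script wraps (model name and prompt are illustrative; prefix caching is disabled as the parser default above does):

```
# Sketch: measure the end-to-end latency of one batch with vLLM's offline API.
import time

from vllm import LLM, SamplingParams

llm = LLM(model="facebook/opt-125m", enable_prefix_caching=False)
sampling_params = SamplingParams(max_tokens=128, ignore_eos=True)
prompts = ["Hello, my name is"] * 8  # one batch of identical prompts

start = time.perf_counter()
llm.generate(prompts, sampling_params)
latency = time.perf_counter() - start
print(f"Batch latency: {latency:.3f} s")
```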

@@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Offline benchmark to test the long document QA throughput.

@@ -141,7 +142,7 @@ def main(args):
    )


if __name__ == "__main__":
def create_argument_parser():
    parser = FlexibleArgumentParser(
        description="Benchmark the performance with or "
        "without automatic prefix caching."
@@ -191,5 +192,11 @@ if __name__ == "__main__":
    )

    parser = EngineArgs.add_cli_args(parser)

    return parser


if __name__ == "__main__":
    parser = create_argument_parser()
    args = parser.parse_args()
    main(args)

@@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Benchmark the efficiency of prefix caching.

@@ -217,7 +218,7 @@ def main(args):
    )


if __name__ == "__main__":
def create_argument_parser():
    parser = FlexibleArgumentParser(
        description="Benchmark the performance with or without "
        "automatic prefix caching."
@@ -267,5 +268,11 @@ if __name__ == "__main__":
    )

    parser = EngineArgs.add_cli_args(parser)

    return parser


if __name__ == "__main__":
    parser = create_argument_parser()
    args = parser.parse_args()
    main(args)

@@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Benchmark offline prioritization."""

import argparse
@@ -160,7 +161,7 @@ def main(args: argparse.Namespace):
        json.dump(results, f, indent=4)


if __name__ == "__main__":
def create_argument_parser():
    parser = FlexibleArgumentParser(description="Benchmark the throughput.")
    parser.add_argument(
        "--backend", type=str, choices=["vllm", "hf", "mii"], default="vllm"
@@ -203,6 +204,12 @@ if __name__ == "__main__":
    )

    parser = EngineArgs.add_cli_args(parser)

    return parser


if __name__ == "__main__":
    parser = create_argument_parser()
    args = parser.parse_args()
    if args.tokenizer is None:
        args.tokenizer = args.model

@ -1,4 +1,5 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
r"""Benchmark online serving throughput.
|
||||
|
||||
On the server side, run one of the following commands:
|
||||
@ -32,7 +33,7 @@ import warnings
|
||||
from collections.abc import AsyncGenerator, Iterable
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime
|
||||
from typing import Any, Optional
|
||||
from typing import Any, Literal, Optional
|
||||
|
||||
import numpy as np
|
||||
from tqdm.asyncio import tqdm
|
||||
@ -60,6 +61,7 @@ from benchmark_dataset import (
|
||||
ASRDataset,
|
||||
BurstGPTDataset,
|
||||
ConversationDataset,
|
||||
CustomDataset,
|
||||
HuggingFaceDataset,
|
||||
InstructCoderDataset,
|
||||
MTBenchDataset,
|
||||
@ -105,14 +107,42 @@ class BenchmarkMetrics:
|
||||
percentiles_e2el_ms: list[tuple[float, float]]
|
||||
|
||||
|
||||
def _get_current_request_rate(
|
||||
ramp_up_strategy: Optional[Literal["linear", "exponential"]],
|
||||
ramp_up_start_rps: Optional[int],
|
||||
ramp_up_end_rps: Optional[int],
|
||||
request_index: int,
|
||||
total_requests: int,
|
||||
request_rate: float,
|
||||
) -> float:
|
||||
if (
|
||||
ramp_up_strategy
|
||||
and ramp_up_start_rps is not None
|
||||
and ramp_up_end_rps is not None
|
||||
):
|
||||
progress = request_index / max(total_requests - 1, 1)
|
||||
if ramp_up_strategy == "linear":
|
||||
increase = (ramp_up_end_rps - ramp_up_start_rps) * progress
|
||||
return ramp_up_start_rps + increase
|
||||
elif ramp_up_strategy == "exponential":
|
||||
ratio = ramp_up_end_rps / ramp_up_start_rps
|
||||
return ramp_up_start_rps * (ratio**progress)
|
||||
else:
|
||||
raise ValueError(f"Unknown ramp-up strategy: {ramp_up_strategy}")
|
||||
return request_rate
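To make the ramp-up arithmetic concrete, a small standalone re-implementation of the formula above (not the benchmark's own helper) showing how the per-request rate evolves:

```
# Sketch of the ramp-up schedule: linear interpolates the RPS, exponential
# scales it by (end/start) ** progress.
def ramp_rate(strategy: str, start_rps: float, end_rps: float,
              request_index: int, total_requests: int) -> float:
    progress = request_index / max(total_requests - 1, 1)
    if strategy == "linear":
        return start_rps + (end_rps - start_rps) * progress
    if strategy == "exponential":
        return start_rps * (end_rps / start_rps) ** progress
    raise ValueError(f"Unknown ramp-up strategy: {strategy}")

# With 5 requests ramping from 1 to 16 RPS:
print([round(ramp_rate("linear", 1, 16, i, 5), 2) for i in range(5)])
# [1.0, 4.75, 8.5, 12.25, 16.0]
print([round(ramp_rate("exponential", 1, 16, i, 5), 2) for i in range(5)])
# [1.0, 2.0, 4.0, 8.0, 16.0]
```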
|
||||
|
||||
|
||||
async def get_request(
|
||||
input_requests: list[SampleRequest],
|
||||
request_rate: float,
|
||||
burstiness: float = 1.0,
|
||||
) -> AsyncGenerator[SampleRequest, None]:
|
||||
ramp_up_strategy: Optional[Literal["linear", "exponential"]] = None,
|
||||
ramp_up_start_rps: Optional[int] = None,
|
||||
ramp_up_end_rps: Optional[int] = None,
|
||||
) -> AsyncGenerator[tuple[SampleRequest, float], None]:
|
||||
"""
|
||||
Asynchronously generates requests at a specified rate
|
||||
with OPTIONAL burstiness.
|
||||
with OPTIONAL burstiness and OPTIONAL ramp-up strategy.
|
||||
|
||||
Args:
|
||||
input_requests:
|
||||
@ -127,22 +157,44 @@ async def get_request(
|
||||
A lower burstiness value (0 < burstiness < 1) results
|
||||
in more bursty requests, while a higher burstiness value
|
||||
(burstiness > 1) results in a more uniform arrival of requests.
|
||||
ramp_up_strategy (optional):
|
||||
The ramp-up strategy. Can be "linear" or "exponential".
|
||||
If None, uses constant request rate (specified by request_rate).
|
||||
ramp_up_start_rps (optional):
|
||||
The starting request rate for ramp-up.
|
||||
ramp_up_end_rps (optional):
|
||||
The ending request rate for ramp-up.
|
||||
"""
|
||||
input_requests: Iterable[SampleRequest] = iter(input_requests)
|
||||
|
||||
# Calculate scale parameter theta to maintain the desired request_rate.
|
||||
assert burstiness > 0, (
|
||||
f"A positive burstiness factor is expected, but given {burstiness}."
|
||||
)
|
||||
theta = 1.0 / (request_rate * burstiness)
|
||||
# Convert to list to get length for ramp-up calculations
|
||||
if isinstance(input_requests, Iterable) and not isinstance(input_requests, list):
|
||||
input_requests = list(input_requests)
|
||||
|
||||
total_requests = len(input_requests)
|
||||
request_index = 0
|
||||
|
||||
for request in input_requests:
|
||||
yield request
|
||||
current_request_rate = _get_current_request_rate(
|
||||
ramp_up_strategy,
|
||||
ramp_up_start_rps,
|
||||
ramp_up_end_rps,
|
||||
request_index,
|
||||
total_requests,
|
||||
request_rate,
|
||||
)
|
||||
|
||||
if request_rate == float("inf"):
|
||||
yield request, current_request_rate
|
||||
|
||||
request_index += 1
|
||||
|
||||
if current_request_rate == float("inf"):
|
||||
# If the request rate is infinity, then we don't need to wait.
|
||||
continue
|
||||
|
||||
theta = 1.0 / (current_request_rate * burstiness)
|
||||
|
||||
# Sample the request interval from the gamma distribution.
|
||||
# If burstiness is 1, it follows exponential distribution.
|
||||
interval = np.random.gamma(shape=burstiness, scale=theta)
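A standalone illustration of the interval sampling above: the gamma distribution is parameterized so the mean inter-arrival time stays at 1 / request_rate, and burstiness only changes the variance (burstiness == 1 is the exponential / Poisson-process case).

```
# Sketch: inter-arrival times for a fixed request rate under different burstiness.
import numpy as np

request_rate = 4.0  # requests per second
for burstiness in (0.5, 1.0, 2.0):
    theta = 1.0 / (request_rate * burstiness)
    intervals = np.random.gamma(shape=burstiness, scale=theta, size=10_000)
    # mean stays ~0.25 s; lower burstiness -> higher std, i.e. burstier traffic
    print(burstiness, round(intervals.mean(), 3), round(intervals.std(), 3))
```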
|
||||
@ -288,6 +340,9 @@ async def benchmark(
|
||||
max_concurrency: Optional[int],
|
||||
lora_modules: Optional[Iterable[str]],
|
||||
extra_body: Optional[dict],
|
||||
ramp_up_strategy: Optional[Literal["linear", "exponential"]] = None,
|
||||
ramp_up_start_rps: Optional[int] = None,
|
||||
ramp_up_end_rps: Optional[int] = None,
|
||||
):
|
||||
if backend in ASYNC_REQUEST_FUNCS:
|
||||
request_func = ASYNC_REQUEST_FUNCS[backend]
|
||||
@ -351,7 +406,15 @@ async def benchmark(
|
||||
|
||||
distribution = "Poisson process" if burstiness == 1.0 else "Gamma distribution"
|
||||
|
||||
print(f"Traffic request rate: {request_rate}")
|
||||
if ramp_up_strategy is not None:
|
||||
print(
|
||||
f"Traffic ramp-up strategy: {ramp_up_strategy}. Will increase "
|
||||
f"RPS from {ramp_up_start_rps} to {ramp_up_end_rps} RPS over "
|
||||
"the duration of the benchmark."
|
||||
)
|
||||
else:
|
||||
print(f"Traffic request rate: {request_rate} RPS.")
|
||||
|
||||
print(f"Burstiness factor: {burstiness} ({distribution})")
|
||||
print(f"Maximum request concurrency: {max_concurrency}")
|
||||
|
||||
@ -371,7 +434,34 @@ async def benchmark(
|
||||
|
||||
benchmark_start_time = time.perf_counter()
|
||||
tasks: list[asyncio.Task] = []
|
||||
async for request in get_request(input_requests, request_rate, burstiness):
|
||||
|
||||
rps_change_events = []
|
||||
last_int_rps = -1
|
||||
if ramp_up_strategy is not None and ramp_up_start_rps is not None:
|
||||
last_int_rps = ramp_up_start_rps
|
||||
rps_change_events.append(
|
||||
{
|
||||
"rps": last_int_rps,
|
||||
"timestamp": datetime.now().isoformat(),
|
||||
}
|
||||
)
|
||||
|
||||
async for request, current_request_rate in get_request(
|
||||
input_requests,
|
||||
request_rate,
|
||||
burstiness,
|
||||
ramp_up_strategy,
|
||||
ramp_up_start_rps,
|
||||
ramp_up_end_rps,
|
||||
):
|
||||
if ramp_up_strategy is not None:
|
||||
current_int_rps = int(current_request_rate)
|
||||
if current_int_rps > last_int_rps:
|
||||
timestamp = datetime.now().isoformat()
|
||||
for rps_val in range(last_int_rps + 1, current_int_rps + 1):
|
||||
rps_change_events.append({"rps": rps_val, "timestamp": timestamp})
|
||||
last_int_rps = current_int_rps
|
||||
|
||||
prompt, prompt_len, output_len, mm_content = (
|
||||
request.prompt,
|
||||
request.prompt_len,
|
||||
@ -395,11 +485,8 @@ async def benchmark(
|
||||
ignore_eos=ignore_eos,
|
||||
extra_body=extra_body,
|
||||
)
|
||||
tasks.append(
|
||||
asyncio.create_task(
|
||||
limited_request_func(request_func_input=request_func_input, pbar=pbar)
|
||||
)
|
||||
)
|
||||
task = limited_request_func(request_func_input=request_func_input, pbar=pbar)
|
||||
tasks.append(asyncio.create_task(task))
|
||||
outputs: list[RequestFuncOutput] = await asyncio.gather(*tasks)
|
||||
|
||||
if profile:
|
||||
@ -475,6 +562,9 @@ async def benchmark(
|
||||
"errors": [output.error for output in outputs],
|
||||
}
|
||||
|
||||
if rps_change_events:
|
||||
result["rps_change_events"] = rps_change_events
|
||||
|
||||
def process_one_metric(
|
||||
# E.g., "ttft"
|
||||
metric_attribute_name: str,
|
||||
@ -608,6 +698,26 @@ def main(args: argparse.Namespace):
|
||||
tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model
|
||||
tokenizer_mode = args.tokenizer_mode
|
||||
|
||||
# Validate ramp-up arguments
|
||||
if args.ramp_up_strategy is not None:
|
||||
if args.request_rate != float("inf"):
|
||||
raise ValueError(
|
||||
"When using ramp-up, do not specify --request-rate. "
|
||||
"The request rate will be controlled by ramp-up parameters. "
|
||||
"Please remove the --request-rate argument."
|
||||
)
|
||||
if args.ramp_up_start_rps is None or args.ramp_up_end_rps is None:
|
||||
raise ValueError(
|
||||
"When using --ramp-up-strategy, both --ramp-up-start-rps and "
|
||||
"--ramp-up-end-rps must be specified"
|
||||
)
|
||||
if args.ramp_up_start_rps < 0 or args.ramp_up_end_rps < 0:
|
||||
raise ValueError("Ramp-up start and end RPS must be non-negative")
|
||||
if args.ramp_up_start_rps > args.ramp_up_end_rps:
|
||||
raise ValueError("Ramp-up start RPS must be less than end RPS")
|
||||
if args.ramp_up_strategy == "exponential" and args.ramp_up_start_rps == 0:
|
||||
raise ValueError("For exponential ramp-up, the start RPS cannot be 0.")
|
||||
|
||||
if args.base_url is not None:
|
||||
api_url = f"{args.base_url}{args.endpoint}"
|
||||
base_url = f"{args.base_url}"
|
||||
@ -627,7 +737,16 @@ def main(args: argparse.Namespace):
|
||||
"'--dataset-path' if required."
|
||||
)
|
||||
|
||||
if args.dataset_name == "sonnet":
|
||||
if args.dataset_name == "custom":
|
||||
dataset = CustomDataset(dataset_path=args.dataset_path)
|
||||
input_requests = dataset.sample(
|
||||
num_requests=args.num_prompts,
|
||||
tokenizer=tokenizer,
|
||||
output_len=args.custom_output_len,
|
||||
skip_chat_template=args.custom_skip_chat_template,
|
||||
)
|
||||
|
||||
elif args.dataset_name == "sonnet":
|
||||
dataset = SonnetDataset(dataset_path=args.dataset_path)
|
||||
# For the "sonnet" dataset, formatting depends on the backend.
|
||||
if args.backend == "openai-chat":
|
||||
@ -762,6 +881,10 @@ def main(args: argparse.Namespace):
|
||||
if "temperature" not in sampling_params:
|
||||
sampling_params["temperature"] = 0.0 # Default to greedy decoding.
|
||||
|
||||
if args.backend == "llama.cpp":
|
||||
# Disable prompt caching in llama.cpp backend
|
||||
sampling_params["cache_prompt"] = False
|
||||
|
||||
# Avoid GC processing "static" data - reduce pause times.
|
||||
gc.collect()
|
||||
gc.freeze()
|
||||
@ -787,6 +910,9 @@ def main(args: argparse.Namespace):
|
||||
max_concurrency=args.max_concurrency,
|
||||
lora_modules=args.lora_modules,
|
||||
extra_body=sampling_params,
|
||||
ramp_up_strategy=args.ramp_up_strategy,
|
||||
ramp_up_start_rps=args.ramp_up_start_rps,
|
||||
ramp_up_end_rps=args.ramp_up_end_rps,
|
||||
)
|
||||
)
|
||||
|
||||
@ -819,6 +945,11 @@ def main(args: argparse.Namespace):
|
||||
result_json["burstiness"] = args.burstiness
|
||||
result_json["max_concurrency"] = args.max_concurrency
|
||||
|
||||
if args.ramp_up_strategy is not None:
|
||||
result_json["ramp_up_strategy"] = args.ramp_up_strategy
|
||||
result_json["ramp_up_start_rps"] = args.ramp_up_start_rps
|
||||
result_json["ramp_up_end_rps"] = args.ramp_up_end_rps
|
||||
|
||||
# Merge with benchmark result
|
||||
result_json = {**result_json, **benchmark_result}
|
||||
|
||||
@ -834,6 +965,8 @@ def main(args: argparse.Namespace):
|
||||
]:
|
||||
if field in result_json:
|
||||
del result_json[field]
|
||||
if field in benchmark_result:
|
||||
del benchmark_result[field]
|
||||
|
||||
# Save to file
|
||||
base_model_id = model_id.split("/")[-1]
|
||||
@ -842,10 +975,14 @@ def main(args: argparse.Namespace):
|
||||
if args.max_concurrency is not None
|
||||
else ""
|
||||
)
|
||||
file_name = f"{backend}-{args.request_rate}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json" # noqa
|
||||
if args.ramp_up_strategy is not None:
|
||||
file_name = f"{backend}-ramp-up-{args.ramp_up_strategy}-{args.ramp_up_start_rps}qps-{args.ramp_up_end_rps}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json" # noqa
|
||||
else:
|
||||
file_name = f"{backend}-{args.request_rate}qps{max_concurrency_str}-{base_model_id}-{current_dt}.json" # noqa
|
||||
if args.result_filename:
|
||||
file_name = args.result_filename
|
||||
if args.result_dir:
|
||||
os.makedirs(args.result_dir, exist_ok=True)
|
||||
file_name = os.path.join(args.result_dir, file_name)
|
||||
with open(
|
||||
file_name, mode="a+" if args.append_result else "w", encoding="utf-8"
|
||||
@ -857,7 +994,7 @@ def main(args: argparse.Namespace):
|
||||
save_to_pytorch_benchmark_format(args, result_json, file_name)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
def create_argument_parser():
|
||||
parser = FlexibleArgumentParser(
|
||||
description="Benchmark the online serving throughput."
|
||||
)
|
||||
@ -886,7 +1023,7 @@ if __name__ == "__main__":
|
||||
"--dataset-name",
|
||||
type=str,
|
||||
default="sharegpt",
|
||||
choices=["sharegpt", "burstgpt", "sonnet", "random", "hf"],
|
||||
choices=["sharegpt", "burstgpt", "sonnet", "random", "hf", "custom"],
|
||||
help="Name of the dataset to benchmark on.",
|
||||
)
|
||||
parser.add_argument(
|
||||
@ -1056,6 +1193,19 @@ if __name__ == "__main__":
|
||||
)
|
||||
|
||||
# group for dataset specific arguments
|
||||
custom_group = parser.add_argument_group("custom dataset options")
|
||||
custom_group.add_argument(
|
||||
"--custom-output-len",
|
||||
type=int,
|
||||
default=256,
|
||||
help="Number of output tokens per request, used only for custom dataset.",
|
||||
)
|
||||
custom_group.add_argument(
|
||||
"--custom-skip-chat-template",
|
||||
action="store_true",
|
||||
help="Skip applying chat template to prompt, used only for custom dataset.",
|
||||
)
|
||||
|
||||
sonnet_group = parser.add_argument_group("sonnet dataset options")
|
||||
sonnet_group.add_argument(
|
||||
"--sonnet-input-len",
|
||||
@ -1194,6 +1344,35 @@ if __name__ == "__main__":
|
||||
"script chooses a LoRA module at random.",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
parser.add_argument(
|
||||
"--ramp-up-strategy",
|
||||
type=str,
|
||||
default=None,
|
||||
choices=["linear", "exponential"],
|
||||
help="The ramp-up strategy. This would be used to "
|
||||
"ramp up the request rate from initial RPS to final "
|
||||
"RPS rate (specified by --ramp-up-start-rps and --ramp-up-end-rps). "
|
||||
"over the duration of the benchmark.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--ramp-up-start-rps",
|
||||
type=int,
|
||||
default=None,
|
||||
help="The starting request rate for ramp-up (RPS). "
|
||||
"Needs to be specified when --ramp-up-strategy is used.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--ramp-up-end-rps",
|
||||
type=int,
|
||||
default=None,
|
||||
help="The ending request rate for ramp-up (RPS). "
|
||||
"Needs to be specified when --ramp-up-strategy is used.",
|
||||
)
|
||||
|
||||
return parser
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = create_argument_parser()
|
||||
args = parser.parse_args()
|
||||
main(args)
|
||||
|
||||
@ -1,4 +1,5 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
r"""Benchmark online serving throughput with structured outputs.
|
||||
|
||||
On the server side, run one of the following commands:
|
||||
@ -11,7 +12,6 @@ On the client side, run:
|
||||
--model <your_model> \
|
||||
--dataset json \
|
||||
--structured-output-ratio 1.0 \
|
||||
--structured-output-backend auto \
|
||||
--request-rate 10 \
|
||||
--num-prompts 1000
|
||||
|
||||
@ -850,7 +850,7 @@ def main(args: argparse.Namespace):
|
||||
json.dump(results, outfile, indent=4)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
def create_argument_parser():
|
||||
parser = FlexibleArgumentParser(
|
||||
description="Benchmark the online serving throughput."
|
||||
)
|
||||
@ -1034,5 +1034,10 @@ if __name__ == "__main__":
|
||||
help="Ratio of Structured Outputs requests",
|
||||
)
|
||||
|
||||
return parser
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = create_argument_parser()
|
||||
args = parser.parse_args()
|
||||
main(args)
|
||||
|
||||
@ -1,4 +1,5 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Benchmark offline inference throughput."""
|
||||
|
||||
import argparse
|
||||
@ -96,7 +97,7 @@ def run_vllm(
|
||||
assert lora_requests is None, "BeamSearch API does not support LoRA"
|
||||
prompts = [request.prompt for request in requests]
|
||||
# output_len should be the same for all requests.
|
||||
output_len = requests[0][2]
|
||||
output_len = requests[0].expected_output_len
|
||||
for request in requests:
|
||||
assert request.expected_output_len == output_len
|
||||
start = time.perf_counter()
|
||||
@ -594,7 +595,7 @@ def validate_args(args):
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
def create_argument_parser():
|
||||
parser = FlexibleArgumentParser(description="Benchmark the throughput.")
|
||||
parser.add_argument(
|
||||
"--backend",
|
||||
@ -716,6 +717,12 @@ if __name__ == "__main__":
|
||||
)
|
||||
|
||||
parser = AsyncEngineArgs.add_cli_args(parser)
|
||||
|
||||
return parser
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = create_argument_parser()
|
||||
args = parser.parse_args()
|
||||
if args.tokenizer is None:
|
||||
args.tokenizer = args.model
|
||||
|
||||
@@ -1,4 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import argparse
import json
@@ -65,4 +66,9 @@ class InfEncoder(json.JSONEncoder):

def write_to_json(filename: str, records: list) -> None:
    with open(filename, "w") as f:
        json.dump(records, f, cls=InfEncoder)
        json.dump(
            records,
            f,
            cls=InfEncoder,
            default=lambda o: f"<{type(o).__name__} object is not JSON serializable>",
        )

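Why the `default=` fallback matters: a sketch (the Handle class is a made-up stand-in) showing that a non-serializable value now degrades to a placeholder string instead of raising TypeError and losing the whole result dump:

```
# Sketch of the write_to_json fallback behavior using plain json.dumps.
import json


class Handle:  # stands in for any non-serializable object in a record
    pass


records = [{"latency_s": 1.25, "handle": Handle()}]

print(json.dumps(
    records,
    default=lambda o: f"<{type(o).__name__} object is not JSON serializable>",
))
# [{"latency_s": 1.25, "handle": "<Handle object is not JSON serializable>"}]
```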
@ -1,4 +1,5 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import argparse
|
||||
import copy
|
||||
|
||||
@ -1,4 +1,5 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# Cutlass bench utils
|
||||
from collections.abc import Iterable
|
||||
|
||||
@ -1,4 +1,5 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import argparse
|
||||
import copy
|
||||
@ -18,7 +19,7 @@ from vllm import _custom_ops as ops
|
||||
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
|
||||
w8a8_block_fp8_matmul,
|
||||
)
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
from vllm.utils import FlexibleArgumentParser, cdiv
|
||||
|
||||
DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())
|
||||
DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512]
|
||||
@ -116,14 +117,9 @@ def bench_fp8(
|
||||
scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
|
||||
scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)
|
||||
|
||||
def ceil_div(x: int, y: int) -> int:
|
||||
return (x + y - 1) // y
|
||||
|
||||
block_scale_a = torch.rand(
|
||||
(m, ceil_div(k, 128)), device="cuda", dtype=torch.float32
|
||||
)
|
||||
block_scale_a = torch.rand((m, cdiv(k, 128)), device="cuda", dtype=torch.float32)
|
||||
block_scale_b = torch.rand(
|
||||
ceil_div(k, 128), ceil_div(n, 128), device="cuda", dtype=torch.float32
|
||||
cdiv(k, 128), cdiv(n, 128), device="cuda", dtype=torch.float32
|
||||
)
|
||||
block_scale_a_M_major = block_scale_a.t().contiguous().t()
|
||||
block_scale_b_K_major = block_scale_b.t().contiguous().t()
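The hunk above swaps the local ceil_div helper for cdiv from vllm.utils when sizing the 128-wide block-scale grids. A small sketch of the ceiling division being computed (the loop values are illustrative):

```
# Sketch: ceiling division used for FP8 block-scale grid shapes; cdiv(k, 128)
# is a drop-in replacement for this local helper.
def ceil_div(x: int, y: int) -> int:
    return (x + y - 1) // y

for k in (128, 129, 4096, 4097):
    assert ceil_div(k, 128) == -(-k // 128)  # equivalent ceiling-division idiom
    print(k, ceil_div(k, 128))
# 128 -> 1, 129 -> 2, 4096 -> 32, 4097 -> 33
```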
|
||||
|
||||
@ -1,4 +1,5 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# Weight Shapes are in the format
|
||||
# ([K, N], TP_SPLIT_DIM)
|
||||
|
||||
@ -1,4 +1,5 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import os
|
||||
|
||||
|
||||
@ -1,4 +1,5 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import asyncio
|
||||
import itertools
|
||||
|
||||
@ -1,4 +1,5 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import json
|
||||
|
||||
|
||||
@ -1,4 +1,5 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import pickle as pkl
|
||||
import time
|
||||
|
||||
benchmarks/kernels/bench_fp8_gemm.py (new file, 158 lines)
@@ -0,0 +1,158 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
import argparse
|
||||
import copy
|
||||
import itertools
|
||||
|
||||
import torch
|
||||
from weight_shapes import WEIGHT_SHAPES
|
||||
|
||||
from vllm._custom_ops import cutlass_scaled_mm as vllm_scaled_mm
|
||||
from vllm._custom_ops import scaled_fp8_quant as vllm_scaled_fp8_quant
|
||||
from vllm.triton_utils import triton
|
||||
|
||||
PROVIDER_CFGS = {
|
||||
"torch-bf16": dict(enabled=True),
|
||||
"fp8-tensor-w-token-a": dict(
|
||||
w="tensor", a="token", no_a_quant=False, enabled=False
|
||||
),
|
||||
"fp8-tensor-w-tensor-a": dict(
|
||||
w="tensor", a="tensor", no_a_quant=False, enabled=True
|
||||
),
|
||||
"fp8-channel-w-token-a": dict(
|
||||
w="channel", a="token", no_a_quant=False, enabled=True
|
||||
),
|
||||
"fp8-channel-w-tensor-a": dict(
|
||||
w="channel", a="tensor", no_a_quant=False, enabled=False
|
||||
),
|
||||
"fp8-tensor-w-token-a-noquant": dict(
|
||||
w="tensor", a="token", no_a_quant=True, enabled=False
|
||||
),
|
||||
"fp8-tensor-w-tensor-a-noquant": dict(
|
||||
w="tensor", a="tensor", no_a_quant=True, enabled=True
|
||||
),
|
||||
"fp8-channel-w-token-a-noquant": dict(
|
||||
w="channel", a="token", no_a_quant=True, enabled=True
|
||||
),
|
||||
"fp8-channel-w-tensor-a-noquant": dict(
|
||||
w="channel", a="tensor", no_a_quant=True, enabled=False
|
||||
),
|
||||
}
|
||||
|
||||
_enabled = [k for k, v in PROVIDER_CFGS.items() if v["enabled"]]
|
||||
|
||||
|
||||
def _quant_weight_fp8(b: torch.Tensor, w_type: str, device: str):
|
||||
if w_type == "tensor":
|
||||
scale_b = torch.ones(1, device=device, dtype=torch.float32)
|
||||
b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, scale_b)
|
||||
else:
|
||||
b_fp8, scale_b_fp8 = vllm_scaled_fp8_quant(b, use_per_token_if_dynamic=True)
|
||||
return b_fp8.t(), scale_b_fp8
|
||||
|
||||
|
||||
def build_fp8_runner(cfg, a, b, dtype, device):
|
||||
b_fp8, scale_b_fp8 = _quant_weight_fp8(b, cfg["w"], device)
|
||||
|
||||
scale_a_const = (
|
||||
torch.ones(1, device=device, dtype=torch.float32)
|
||||
if cfg["a"] == "tensor"
|
||||
else None
|
||||
)
|
||||
|
||||
if cfg["no_a_quant"]:
|
||||
if cfg["a"] == "tensor":
|
||||
a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a_const)
|
||||
else:
|
||||
a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, use_per_token_if_dynamic=True)
|
||||
|
||||
def run():
|
||||
return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
|
||||
|
||||
return run
|
||||
|
||||
if cfg["a"] == "tensor":
|
||||
|
||||
def run():
|
||||
a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, scale_a_const)
|
||||
return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
|
||||
|
||||
else:
|
||||
|
||||
def run():
|
||||
a_fp8, scale_a_fp8 = vllm_scaled_fp8_quant(a, use_per_token_if_dynamic=True)
|
||||
return vllm_scaled_mm(a_fp8, b_fp8, scale_a_fp8, scale_b_fp8, dtype)
|
||||
|
||||
return run
|
||||
|
||||
|
||||
@triton.testing.perf_report(
|
||||
triton.testing.Benchmark(
|
||||
x_names=["batch_size"],
|
||||
x_vals=[1, 16, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384],
|
||||
x_log=False,
|
||||
line_arg="provider",
|
||||
line_vals=_enabled,
|
||||
line_names=_enabled,
|
||||
ylabel="TFLOP/s (larger is better)",
|
||||
plot_name="BF16 vs FP8 GEMMs",
|
||||
args={},
|
||||
)
|
||||
)
|
||||
def benchmark(batch_size, provider, N, K):
|
||||
M = batch_size
|
||||
device = "cuda"
|
||||
dtype = torch.bfloat16
|
||||
|
||||
a = torch.randn((M, K), device=device, dtype=dtype)
|
||||
b = torch.randn((N, K), device=device, dtype=dtype)
|
||||
|
||||
quantiles = [0.5, 0.2, 0.8]
|
||||
|
||||
if provider == "torch-bf16":
|
||||
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
|
||||
lambda: torch.nn.functional.linear(a, b), quantiles=quantiles
|
||||
)
|
||||
else:
|
||||
cfg = PROVIDER_CFGS[provider]
|
||||
run_quant = build_fp8_runner(cfg, a, b, dtype, device)
|
||||
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
|
||||
lambda: run_quant(), quantiles=quantiles
|
||||
)
|
||||
|
||||
to_tflops = lambda t_ms: (2 * M * N * K) * 1e-12 / (t_ms * 1e-3)
|
||||
return to_tflops(ms), to_tflops(max_ms), to_tflops(min_ms)
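A quick worked example of the TFLOP/s conversion above (the timing value is illustrative, not a measurement): a GEMM of shape (M, K) x (K, N) performs 2*M*N*K floating-point operations.

```
# Sketch: convert a measured kernel time in milliseconds to TFLOP/s.
M, N, K = 1024, 4096, 4096
elapsed_ms = 0.35  # illustrative timing

tflops = (2 * M * N * K) * 1e-12 / (elapsed_ms * 1e-3)
print(f"{tflops:.1f} TFLOP/s")  # ~98.2 TFLOP/s for these example numbers
```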
|
||||
|
||||
|
||||
def prepare_shapes(args):
|
||||
out = []
|
||||
for model, tp_size in itertools.product(args.models, args.tp_sizes):
|
||||
for KN, tp_dim in copy.deepcopy(WEIGHT_SHAPES[model]):
|
||||
KN[tp_dim] //= tp_size
|
||||
KN.append(model)
|
||||
out.append(KN)
|
||||
return out
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument(
|
||||
"--models",
|
||||
nargs="+",
|
||||
type=str,
|
||||
default=["meta-llama/Llama-3.1-8B-Instruct"],
|
||||
choices=list(WEIGHT_SHAPES.keys()),
|
||||
)
|
||||
parser.add_argument("--tp-sizes", nargs="+", type=int, default=[1])
|
||||
args = parser.parse_args()
|
||||
|
||||
for K, N, model in prepare_shapes(args):
|
||||
print(f"{model}, N={N} K={K}, BF16 vs FP8 GEMMs TFLOP/s:")
|
||||
benchmark.run(
|
||||
print_data=True,
|
||||
show_plots=True,
|
||||
save_path=f"bench_fp8_res_n{N}_k{K}",
|
||||
N=N,
|
||||
K=K,
|
||||
)
|
||||
|
||||
print("Benchmark finished!")
|
||||
benchmarks/kernels/bench_int8_gemm.py (new file, 169 lines)
@@ -0,0 +1,169 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import argparse
|
||||
import copy
|
||||
import itertools
|
||||
|
||||
import torch
|
||||
from weight_shapes import WEIGHT_SHAPES
|
||||
|
||||
from vllm._custom_ops import cutlass_scaled_mm as vllm_scaled_mm
|
||||
from vllm._custom_ops import scaled_int8_quant as vllm_scaled_int8_quant
|
||||
from vllm.triton_utils import triton
|
||||
|
||||
PROVIDER_CFGS = {
|
||||
"torch-bf16": dict(enabled=True),
|
||||
"int8-tensor-w-token-a": dict(
|
||||
w="tensor", a="token", no_a_quant=False, enabled=False
|
||||
),
|
||||
"int8-tensor-w-tensor-a": dict(
|
||||
w="tensor", a="tensor", no_a_quant=False, enabled=True
|
||||
),
|
||||
"int8-channel-w-token-a": dict(
|
||||
w="channel", a="token", no_a_quant=False, enabled=True
|
||||
),
|
||||
"int8-channel-w-tensor-a": dict(
|
||||
w="channel", a="tensor", no_a_quant=False, enabled=False
|
||||
),
|
||||
"int8-tensor-w-token-a-noquant": dict(
|
||||
w="tensor", a="token", no_a_quant=True, enabled=False
|
||||
),
|
||||
"int8-tensor-w-tensor-a-noquant": dict(
|
||||
w="tensor", a="tensor", no_a_quant=True, enabled=True
|
||||
),
|
||||
"int8-channel-w-token-a-noquant": dict(
|
||||
w="channel", a="token", no_a_quant=True, enabled=True
|
||||
),
|
||||
"int8-channel-w-tensor-a-noquant": dict(
|
||||
w="channel", a="tensor", no_a_quant=True, enabled=False
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
def _quant_weight(b, w_type, device):
|
||||
if w_type == "tensor":
|
||||
scale_b = torch.ones(1, device=device, dtype=torch.float32)
|
||||
b_int8, scale_b_int8, _ = vllm_scaled_int8_quant(b, scale_b)
|
||||
assert scale_b_int8.numel() == 1
|
||||
else: # channel
|
||||
b_int8, scale_b_int8, _ = vllm_scaled_int8_quant(b)
|
||||
assert scale_b_int8.numel() == b.shape[0]
|
||||
return b_int8.t(), scale_b_int8
|
||||
|
||||
|
||||
def build_int8_runner(cfg, a, b, dtype, device):
|
||||
# quant before running the kernel
|
||||
b_int8, scale_b_int8 = _quant_weight(b, cfg["w"], device)
|
||||
|
||||
scale_a_const = None
|
||||
if cfg["a"] == "tensor":
|
||||
scale_a_const = torch.ones(1, device=device, dtype=torch.float32)
|
||||
|
||||
# no quant, create activation ahead
|
||||
if cfg["no_a_quant"]:
|
||||
if cfg["a"] == "tensor":
|
||||
a_int8, scale_a_int8, _ = vllm_scaled_int8_quant(a, scale_a_const)
|
||||
else: # token
|
||||
a_int8, scale_a_int8, _ = vllm_scaled_int8_quant(a)
|
||||
|
||||
def run_quant():
|
||||
return vllm_scaled_mm(a_int8, b_int8, scale_a_int8, scale_b_int8, dtype)
|
||||
|
||||
return run_quant
|
||||
|
||||
# dynamic quant, create activation inside
|
||||
if cfg["a"] == "tensor":
|
||||
|
||||
def run_quant():
|
||||
a_int8, scale_a_int8, _ = vllm_scaled_int8_quant(a, scale_a_const)
|
||||
return vllm_scaled_mm(a_int8, b_int8, scale_a_int8, scale_b_int8, dtype)
|
||||
|
||||
else: # token
|
||||
|
||||
def run_quant():
|
||||
a_int8, scale_a_int8, _ = vllm_scaled_int8_quant(a)
|
||||
return vllm_scaled_mm(a_int8, b_int8, scale_a_int8, scale_b_int8, dtype)
|
||||
|
||||
return run_quant
|
||||
|
||||
|
||||
_enabled = [k for k, v in PROVIDER_CFGS.items() if v.get("enabled")]
|
||||
|
||||
|
||||
@triton.testing.perf_report(
|
||||
triton.testing.Benchmark(
|
||||
x_names=["batch_size"],
|
||||
x_vals=[1, 16, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384],
|
||||
x_log=False,
|
||||
line_arg="provider",
|
||||
line_vals=_enabled,
|
||||
line_names=[k for k in _enabled],
|
||||
ylabel="TFLOP/s (larger is better)",
|
||||
plot_name="BF16 vs INT8 GEMMs",
|
||||
args={},
|
||||
)
|
||||
)
|
||||
def benchmark(batch_size, provider, N, K):
|
||||
M = batch_size
|
||||
device = "cuda"
|
||||
dtype = torch.bfloat16
|
||||
a = torch.randn((M, K), device=device, dtype=dtype)
|
||||
b = torch.randn((N, K), device=device, dtype=dtype)
|
||||
|
||||
quantiles = [0.5, 0.2, 0.8]
|
||||
|
||||
if provider == "torch-bf16":
|
||||
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
|
||||
lambda: torch.nn.functional.linear(a, b), quantiles=quantiles
|
||||
)
|
||||
else:
|
||||
cfg = PROVIDER_CFGS[provider]
|
||||
run_quant = build_int8_runner(cfg, a, b, dtype, device)
|
||||
ms, min_ms, max_ms = triton.testing.do_bench_cudagraph(
|
||||
lambda: run_quant(), quantiles=quantiles
|
||||
)
|
||||
|
||||
to_tflops = lambda t_ms: (2 * M * N * K) * 1e-12 / (t_ms * 1e-3)
|
||||
return to_tflops(ms), to_tflops(max_ms), to_tflops(min_ms)
|
||||
|
||||
|
||||
def prepare_shapes(args):
|
||||
KN_model_names = []
|
||||
for model, tp_size in itertools.product(args.models, args.tp_sizes):
|
||||
for KN, tp_dim in copy.deepcopy(WEIGHT_SHAPES[model]):
|
||||
KN[tp_dim] //= tp_size
|
||||
KN.append(model)
|
||||
KN_model_names.append(KN)
|
||||
return KN_model_names
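For clarity, a self-contained sketch of what prepare_shapes() yields, using a made-up WEIGHT_SHAPES entry (each entry is ([K, N], tp_split_dim), and the TP-sharded dimension is divided by the tensor-parallel size):

```
# Sketch of the shape preparation with a hypothetical weight-shape table.
import copy
import itertools

WEIGHT_SHAPES = {  # stand-in for the real weight_shapes.py table
    "example/model": [([4096, 12288], 1), ([4096, 4096], 0)],
}

def prepare_shapes(models, tp_sizes):
    out = []
    for model, tp_size in itertools.product(models, tp_sizes):
        for KN, tp_dim in copy.deepcopy(WEIGHT_SHAPES[model]):
            KN[tp_dim] //= tp_size  # shard the TP-split dimension
            KN.append(model)
            out.append(KN)
    return out

print(prepare_shapes(["example/model"], [2]))
# [[4096, 6144, 'example/model'], [2048, 4096, 'example/model']]
```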
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument(
|
||||
"--models",
|
||||
nargs="+",
|
||||
type=str,
|
||||
default=["meta-llama/Llama-3.1-8B-Instruct"],
|
||||
choices=list(WEIGHT_SHAPES.keys()),
|
||||
help="List of models to benchmark",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--tp-sizes",
|
||||
nargs="+",
|
||||
type=int,
|
||||
default=[1],
|
||||
help="List of tensor parallel sizes",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
for K, N, model in prepare_shapes(args):
|
||||
print(f"{model}, N={N} K={K}, BF16 vs INT8 GEMMs TFLOP/s:")
|
||||
benchmark.run(
|
||||
print_data=True,
|
||||
show_plots=True,
|
||||
save_path=f"bench_int8_res_n{N}_k{K}",
|
||||
N=N,
|
||||
K=K,
|
||||
)
|
||||
|
||||
print("Benchmark finished!")
|
||||
@ -1,4 +1,5 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import os
|
||||
import sys
|
||||
|
||||
@ -1,4 +1,5 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
# Copyright (c) Microsoft Corporation.
|
||||
# Licensed under the MIT License.
|
||||
|
||||
|
||||
@ -1,4 +1,5 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
Benchmark the performance of the cutlass_moe_fp4 kernel vs the triton_moe
|
||||
kernel. The cutlass_moe_fp4 kernel takes in fp4 quantized weights and 16-bit
|
||||
@ -90,7 +91,7 @@ def bench_run(
|
||||
|
||||
score = torch.randn((m, num_experts), device=device, dtype=dtype)
|
||||
|
||||
topk_weights, topk_ids = fused_topk(a, score, topk, renormalize=False)
|
||||
topk_weights, topk_ids, _ = fused_topk(a, score, topk, renormalize=False)
|
||||
|
||||
quant_blocksize = 16
|
||||
w1_blockscale = torch.empty(
|
||||
|
||||
@ -1,4 +1,5 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import torch
|
||||
import torch.utils.benchmark as benchmark
|
||||
@ -6,8 +7,8 @@ from benchmark_shapes import WEIGHT_SHAPES_MOE
|
||||
|
||||
from vllm import _custom_ops as ops
|
||||
from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
|
||||
from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp8
|
||||
from vllm.model_executor.layers.fused_moe.fused_moe import (
|
||||
cutlass_moe_fp8,
|
||||
fused_experts,
|
||||
fused_topk,
|
||||
)
|
||||
@ -69,18 +70,9 @@ def bench_run(
|
||||
w1_scale = torch.empty((num_experts, 1, 1), device="cuda", dtype=torch.float32)
|
||||
w2_scale = torch.empty((num_experts, 1, 1), device="cuda", dtype=torch.float32)
|
||||
|
||||
ab_strides1 = torch.full((num_experts,), k, device="cuda", dtype=torch.int64)
|
||||
c_strides1 = torch.full((num_experts,), 2 * n, device="cuda", dtype=torch.int64)
|
||||
ab_strides2 = torch.full((num_experts,), n, device="cuda", dtype=torch.int64)
|
||||
c_strides2 = torch.full((num_experts,), k, device="cuda", dtype=torch.int64)
|
||||
|
||||
for expert in range(num_experts):
|
||||
w1_q[expert], w1_scale[expert] = ops.scaled_fp8_quant(w1[expert])
|
||||
w2_q[expert], w2_scale[expert] = ops.scaled_fp8_quant(w2[expert])
|
||||
w1_q_notransp = w1_q.clone()
|
||||
w2_q_notransp = w2_q.clone()
|
||||
w1_q = w1_q.transpose(1, 2)
|
||||
w2_q = w2_q.transpose(1, 2)
|
||||
|
||||
score = torch.randn((m, num_experts), device="cuda", dtype=dtype)
|
||||
|
||||
@ -121,10 +113,6 @@ def bench_run(
|
||||
w2_scale: torch.Tensor,
|
||||
topk_weights: torch.Tensor,
|
||||
topk_ids: torch.Tensor,
|
||||
ab_strides1: torch.Tensor,
|
||||
c_strides1: torch.Tensor,
|
||||
ab_strides2: torch.Tensor,
|
||||
c_strides2: torch.Tensor,
|
||||
num_repeats: int,
|
||||
):
|
||||
for _ in range(num_repeats):
|
||||
@ -132,14 +120,10 @@ def bench_run(
|
||||
a,
|
||||
w1,
|
||||
w2,
|
||||
w1_scale,
|
||||
w2_scale,
|
||||
topk_weights,
|
||||
topk_ids,
|
||||
ab_strides1,
|
||||
c_strides1,
|
||||
ab_strides2,
|
||||
c_strides2,
|
||||
w1_scale,
|
||||
w2_scale,
|
||||
a1_scale=a_scale,
|
||||
)
|
||||
|
||||
@ -152,10 +136,6 @@ def bench_run(
|
||||
w2_scale: torch.Tensor,
|
||||
topk_weights: torch.Tensor,
|
||||
topk_ids: torch.Tensor,
|
||||
ab_strides1: torch.Tensor,
|
||||
c_strides1: torch.Tensor,
|
||||
ab_strides2: torch.Tensor,
|
||||
c_strides2: torch.Tensor,
|
||||
):
|
||||
with set_current_vllm_config(
|
||||
VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1))
|
||||
@ -164,14 +144,10 @@ def bench_run(
|
||||
a,
|
||||
w1_q,
|
||||
w2_q,
|
||||
w1_scale,
|
||||
w2_scale,
|
||||
topk_weights,
|
||||
topk_ids,
|
||||
ab_strides1,
|
||||
c_strides1,
|
||||
ab_strides2,
|
||||
c_strides2,
|
||||
w1_scale,
|
||||
w2_scale,
|
||||
a1_scale=a_scale,
|
||||
)
|
||||
|
||||
@ -217,10 +193,6 @@ def bench_run(
|
||||
w2_scale,
|
||||
topk_weights,
|
||||
topk_ids,
|
||||
ab_strides1,
|
||||
c_strides1,
|
||||
ab_strides2,
|
||||
c_strides2,
|
||||
)
|
||||
torch.cuda.synchronize()
|
||||
|
||||
@ -229,8 +201,8 @@ def bench_run(
|
||||
with torch.cuda.graph(triton_graph, stream=triton_stream):
|
||||
run_triton_from_graph(
|
||||
a,
|
||||
w1_q_notransp,
|
||||
w2_q_notransp,
|
||||
w1_q,
|
||||
w2_q,
|
||||
topk_weights,
|
||||
topk_ids,
|
||||
w1_scale,
|
||||
@ -249,18 +221,12 @@ def bench_run(
|
||||
"w2": w2,
|
||||
"score": score,
|
||||
"topk": topk,
|
||||
"w1_q_notransp": w1_q_notransp,
|
||||
"w2_q_notransp": w2_q_notransp,
|
||||
# Cutlass params
|
||||
"a_scale": a_scale,
|
||||
"w1_q": w1_q,
|
||||
"w2_q": w2_q,
|
||||
"w1_scale": w1_scale,
|
||||
"w2_scale": w2_scale,
|
||||
"ab_strides1": ab_strides1,
|
||||
"c_strides1": c_strides1,
|
||||
"ab_strides2": ab_strides2,
|
||||
"c_strides2": c_strides2,
|
||||
# cuda graph params
|
||||
"cutlass_graph": cutlass_graph,
|
||||
"triton_graph": triton_graph,
|
||||
@ -278,8 +244,8 @@ def bench_run(
|
||||
# Warmup
|
||||
run_triton_moe(
|
||||
a,
|
||||
w1_q_notransp,
|
||||
w2_q_notransp,
|
||||
w1_q,
|
||||
w2_q,
|
||||
topk_weights,
|
||||
topk_ids,
|
||||
w1_scale,
|
||||
@ -290,7 +256,7 @@ def bench_run(
|
||||
|
||||
results.append(
|
||||
benchmark.Timer(
|
||||
stmt="run_triton_moe(a, w1_q_notransp, w2_q_notransp, topk_weights, topk_ids, w1_scale, w2_scale, a_scale, num_runs)", # noqa: E501
|
||||
stmt="run_triton_moe(a, w1_q, w2_q, topk_weights, topk_ids, w1_scale, w2_scale, a_scale, num_runs)", # noqa: E501
|
||||
globals=globals,
|
||||
label=label,
|
||||
sub_label=sub_label,
|
||||
@ -321,16 +287,12 @@ def bench_run(
|
||||
w2_scale,
|
||||
topk_weights,
|
||||
topk_ids,
|
||||
ab_strides1,
|
||||
c_strides1,
|
||||
ab_strides2,
|
||||
c_strides2,
|
||||
num_warmup,
|
||||
)
|
||||
|
||||
results.append(
|
||||
benchmark.Timer(
|
||||
stmt="run_cutlass_moe(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, topk_weights, topk_ids, ab_strides1, c_strides1, ab_strides2, c_strides2, num_runs)", # noqa: E501
|
||||
stmt="run_cutlass_moe(a, a_scale, w1_q, w2_q, w1_scale, w2_scale, topk_weights, topk_ids, num_runs)", # noqa: E501
|
||||
globals=globals,
|
||||
label=label,
|
||||
sub_label=sub_label,
|
||||
|
||||
@ -1,4 +1,5 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import time
|
||||
|
||||
|
||||
@ -1,4 +1,5 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import argparse
|
||||
import copy
|
||||
|
||||
@ -1,4 +1,5 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import argparse
|
||||
import copy
|
||||
|
||||
@ -1,4 +1,5 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import torch
|
||||
import torch.utils.benchmark as benchmark
|
||||
@ -21,8 +22,16 @@ from vllm.model_executor.layers.quantization.utils.marlin_utils import (
|
||||
MARLIN_SUPPORTED_GROUP_SIZES,
|
||||
query_marlin_supported_quant_types,
|
||||
)
|
||||
from vllm.model_executor.layers.quantization.utils.marlin_utils_fp4 import (
|
||||
FP4_MARLIN_SUPPORTED_GROUP_SIZES,
|
||||
rand_marlin_weight_fp4_like,
|
||||
)
|
||||
from vllm.model_executor.layers.quantization.utils.marlin_utils_fp8 import (
|
||||
marlin_quant_fp8_torch,
|
||||
)
|
||||
from vllm.model_executor.layers.quantization.utils.marlin_utils_test import (
|
||||
MarlinWorkspace,
|
||||
awq_marlin_quantize,
|
||||
marlin_quantize,
|
||||
)
|
||||
from vllm.model_executor.layers.quantization.utils.marlin_utils_test_24 import (
|
||||
@ -34,7 +43,7 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import (
|
||||
quantize_weights,
|
||||
sort_weights,
|
||||
)
|
||||
from vllm.scalar_type import ScalarType
|
||||
from vllm.scalar_type import ScalarType, scalar_types
|
||||
from vllm.utils import FlexibleArgumentParser
|
||||
|
||||
DEFAULT_MODELS = ["meta-llama/Llama-2-7b-hf/TP1"]
|
||||
@ -56,80 +65,144 @@ def bench_run(
|
||||
size_n: int,
|
||||
):
|
||||
label = "Quant Matmul"
|
||||
|
||||
sub_label = "{}, act={} k_full={}, q={}, g={}, MKN=({}x{}x{})".format(
|
||||
model, act_order, is_k_full, str(quant_type), group_size, size_m, size_k, size_n
|
||||
)
|
||||
|
||||
print(f"Testing: {sub_label}")
|
||||
|
||||
a = torch.randn(size_m, size_k).to(torch.half).cuda()
|
||||
b = torch.rand(size_k, size_n).to(torch.half).cuda()
|
||||
has_zp = quant_type in [scalar_types.uint4, scalar_types.uint8]
|
||||
if act_order and (group_size == -1 or group_size == size_k or has_zp):
|
||||
return
|
||||
if size_k % group_size != 0:
|
||||
return
|
||||
|
||||
a_tmp = torch.zeros(size_m, size_k).to(torch.half).cuda()
|
||||
|
||||
# Marlin quant
|
||||
(
|
||||
marlin_w_ref,
|
||||
marlin_q_w,
|
||||
marlin_s,
|
||||
marlin_g_idx,
|
||||
marlin_sort_indices,
|
||||
marlin_rand_perm,
|
||||
) = marlin_quantize(b, quant_type, group_size, act_order)
|
||||
|
||||
# Marlin_24 quant
|
||||
(marlin_24_w_ref, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s) = (
|
||||
marlin_24_quantize(b, quant_type, group_size)
|
||||
marlin_24_supported = (
|
||||
quant_type in GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES
|
||||
and group_size in GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES
|
||||
)
|
||||
|
||||
marlin_zp = torch.empty(0, dtype=torch.int, device=b.device)
|
||||
|
||||
# GPTQ quant
|
||||
(w_ref, q_w, s, g_idx, rand_perm) = gptq_quantize_weights(
|
||||
b, quant_type, group_size, act_order
|
||||
repack_supported = (
|
||||
quant_type in GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES
|
||||
and group_size in MARLIN_SUPPORTED_GROUP_SIZES
|
||||
)
|
||||
q_w_gptq = gptq_pack(q_w, quant_type.size_bits, size_k, size_n)
|
||||
|
||||
# For act_order, sort the "weights" and "g_idx"
|
||||
# so that group ids are increasing
|
||||
repack_sort_indices = torch.empty(0, dtype=torch.int, device=b.device)
|
||||
if act_order:
|
||||
(q_w, g_idx, repack_sort_indices) = sort_weights(q_w, g_idx)
|
||||
|
||||
# Prepare
|
||||
marlin_workspace = MarlinWorkspace(
|
||||
size_n, GPTQ_MARLIN_MIN_THREAD_N, GPTQ_MARLIN_MAX_PARALLEL
|
||||
)
|
||||
|
||||
marlin_24_workspace = MarlinWorkspace(
|
||||
size_n, GPTQ_MARLIN_24_MIN_THREAD_N, GPTQ_MARLIN_24_MAX_PARALLEL
|
||||
)
|
||||
marlin_zp = torch.zeros_like(marlin_s, dtype=torch.int)
|
||||
|
||||
# AllSpark W8A16 quant
|
||||
as_supported_case = (
|
||||
allspark_supported = (
|
||||
quant_type in ALLSPARK_SUPPORTED_QUANT_TYPES
|
||||
and group_size == -1
|
||||
and not act_order
|
||||
and is_k_full
|
||||
)
|
||||
if as_supported_case:
|
||||
properties = torch.cuda.get_device_properties(b.device.index)
|
||||
sm_count = properties.multi_processor_count
|
||||
sm_version = properties.major * 10 + properties.minor
|
||||
|
||||
supported_arch = sm_version >= 80 and sm_version < 90
|
||||
as_supported_case = as_supported_case and supported_arch
|
||||
if supported_arch:
|
||||
has_zp = False
|
||||
w_ref, qw, s, zp = quantize_weights(b, quant_type, group_size, has_zp)
|
||||
qw = qw.to(torch.uint8)
|
||||
|
||||
qw_reorder, s_reorder, zp_reorder = ops.allspark_repack_weight(
|
||||
qw, s, zp, has_zp
|
||||
def gen_marlin_params():
|
||||
# Marlin quant
|
||||
marlin_g_idx = marlin_sort_indices = marlin_zp = marlin_s2 = None
|
||||
if quant_type == scalar_types.float4_e2m1f:
|
||||
if group_size != 16 or act_order:
|
||||
return
|
||||
marlin_w_ref, marlin_q_w, marlin_s, marlin_s2 = rand_marlin_weight_fp4_like(
|
||||
b.T, group_size
|
||||
)
|
||||
CUBLAS_M_THRESHOLD = ALLSPARK_AMPERE_M_CUBLAS_THRESHOLD
|
||||
elif quant_type == scalar_types.float8_e4m3fn:
|
||||
if group_size not in [-1, 128] or act_order:
|
||||
return
|
||||
marlin_w_ref, marlin_q_w, marlin_s = marlin_quant_fp8_torch(b.T, group_size)
|
||||
elif group_size == 16:
|
||||
return
|
||||
elif has_zp:
|
||||
marlin_w_ref, marlin_q_w, marlin_s, marlin_zp = awq_marlin_quantize(
|
||||
b, quant_type, group_size
|
||||
)
|
||||
else:
|
||||
marlin_w_ref, marlin_q_w, marlin_s, marlin_g_idx, marlin_sort_indices, _ = (
|
||||
marlin_quantize(b, quant_type, group_size, act_order)
|
||||
)
|
||||
return (
|
||||
marlin_w_ref,
|
||||
marlin_q_w,
|
||||
marlin_s,
|
||||
marlin_s2,
|
||||
marlin_zp,
|
||||
marlin_g_idx,
|
||||
marlin_sort_indices,
|
||||
)
|
||||
|
||||
def gen_marlin_24_params():
|
||||
marlin_24_w_ref = marlin_24_q_w_comp = marlin_24_meta = marlin_24_s = None
|
||||
if marlin_24_supported:
|
||||
(marlin_24_w_ref, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s) = (
|
||||
marlin_24_quantize(b, quant_type, group_size)
|
||||
)
|
||||
return (marlin_24_w_ref, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s)
|
||||
|
||||
def gen_repack_params():
|
||||
q_w_gptq = None
|
||||
repack_sort_indices = None
|
||||
if repack_supported:
|
||||
(w_ref, q_w, s, g_idx, rand_perm) = gptq_quantize_weights(
|
||||
b, quant_type, group_size, act_order
|
||||
)
|
||||
q_w_gptq = gptq_pack(q_w, quant_type.size_bits, size_k, size_n)
|
||||
|
||||
# For act_order, sort the "weights" and "g_idx"
|
||||
# so that group ids are increasing
|
||||
repack_sort_indices = torch.empty(0, dtype=torch.int, device=b.device)
|
||||
if act_order:
|
||||
(q_w, g_idx, repack_sort_indices) = sort_weights(q_w, g_idx)
|
||||
return q_w_gptq, repack_sort_indices
|
||||
|
||||
def gen_allspark_params():
|
||||
qw_reorder = s_reorder = zp_reorder = sm_count = sm_version = (
|
||||
CUBLAS_M_THRESHOLD
|
||||
) = None
|
||||
nonlocal allspark_supported
|
||||
if allspark_supported:
|
||||
properties = torch.cuda.get_device_properties(b.device.index)
|
||||
sm_count = properties.multi_processor_count
|
||||
sm_version = properties.major * 10 + properties.minor
|
||||
|
||||
supported_arch = sm_version >= 80 and sm_version < 90
|
||||
allspark_supported = allspark_supported and supported_arch
|
||||
if supported_arch:
|
||||
w_ref, qw, s, zp = quantize_weights(b, quant_type, group_size, has_zp)
|
||||
qw = qw.to(torch.uint8)
|
||||
|
||||
qw_reorder, s_reorder, zp_reorder = ops.allspark_repack_weight(
|
||||
qw, s, zp, has_zp
|
||||
)
|
||||
CUBLAS_M_THRESHOLD = ALLSPARK_AMPERE_M_CUBLAS_THRESHOLD
|
||||
return (
|
||||
qw_reorder,
|
||||
s_reorder,
|
||||
zp_reorder,
|
||||
sm_count,
|
||||
sm_version,
|
||||
CUBLAS_M_THRESHOLD,
|
||||
)
|
||||
|
||||
(
|
||||
marlin_w_ref,
|
||||
marlin_q_w,
|
||||
marlin_s,
|
||||
marlin_s2,
|
||||
marlin_zp,
|
||||
marlin_g_idx,
|
||||
marlin_sort_indices,
|
||||
) = gen_marlin_params()
|
||||
marlin_24_w_ref, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s = (
|
||||
gen_marlin_24_params()
|
||||
)
|
||||
q_w_gptq, repack_sort_indices = gen_repack_params()
|
||||
qw_reorder, s_reorder, zp_reorder, sm_count, sm_version, CUBLAS_M_THRESHOLD = (
|
||||
gen_allspark_params()
|
||||
)
|
||||
|
||||
# Prepare
|
||||
marlin_workspace = MarlinWorkspace(
|
||||
size_n, GPTQ_MARLIN_MIN_THREAD_N, GPTQ_MARLIN_MAX_PARALLEL
|
||||
)
|
||||
marlin_24_workspace = MarlinWorkspace(
|
||||
size_n, GPTQ_MARLIN_24_MIN_THREAD_N, GPTQ_MARLIN_24_MAX_PARALLEL
|
||||
)
|
||||
|
||||
globals = {
|
||||
# Gen params
|
||||
@ -139,15 +212,14 @@ def bench_run(
|
||||
"size_n": size_n,
|
||||
"size_k": size_k,
|
||||
"a": a,
|
||||
"a_tmp": a_tmp,
|
||||
# Marlin params
|
||||
"marlin_w_ref": marlin_w_ref,
|
||||
"marlin_q_w": marlin_q_w,
|
||||
"marlin_s": marlin_s,
|
||||
"marlin_s2": marlin_s2,
|
||||
"marlin_zp": marlin_zp,
|
||||
"marlin_g_idx": marlin_g_idx,
|
||||
"marlin_sort_indices": marlin_sort_indices,
|
||||
"marlin_rand_perm": marlin_rand_perm,
|
||||
"marlin_workspace": marlin_workspace,
|
||||
"is_k_full": is_k_full,
|
||||
# Marlin_24 params
|
||||
@ -160,12 +232,12 @@ def bench_run(
|
||||
"q_w_gptq": q_w_gptq,
|
||||
"repack_sort_indices": repack_sort_indices,
|
||||
# AllSpark W8A16 params
|
||||
"qw_reorder": qw_reorder if as_supported_case else None,
|
||||
"s_reorder": s_reorder if as_supported_case else None,
|
||||
"zp_reorder": zp_reorder if as_supported_case else None,
|
||||
"sm_count": sm_count if as_supported_case else None,
|
||||
"sm_version": sm_version if as_supported_case else None,
|
||||
"CUBLAS_M_THRESHOLD": CUBLAS_M_THRESHOLD if as_supported_case else None,
|
||||
"qw_reorder": qw_reorder,
|
||||
"s_reorder": s_reorder,
|
||||
"zp_reorder": zp_reorder,
|
||||
"sm_count": sm_count,
|
||||
"sm_version": sm_version,
|
||||
"CUBLAS_M_THRESHOLD": CUBLAS_M_THRESHOLD,
|
||||
# Kernels
|
||||
"gptq_marlin_gemm": ops.gptq_marlin_gemm,
|
||||
"gptq_marlin_24_gemm": ops.gptq_marlin_24_gemm,
|
||||
@ -176,7 +248,7 @@ def bench_run(
|
||||
min_run_time = 1
|
||||
|
||||
# Warmup pytorch
|
||||
for i in range(5):
|
||||
for _ in range(5):
|
||||
torch.matmul(a, marlin_w_ref)
|
||||
|
||||
results.append(
|
||||
@ -191,17 +263,17 @@ def bench_run(
|
||||
|
||||
results.append(
|
||||
benchmark.Timer(
|
||||
stmt="output = gptq_marlin_gemm(a, marlin_q_w, marlin_s, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, False, False)", # noqa: E501
|
||||
stmt="output = gptq_marlin_gemm(a, None, marlin_q_w, marlin_s, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, False, False)", # noqa: E501
|
||||
globals=globals,
|
||||
label=label,
|
||||
sub_label=sub_label,
|
||||
description="gptq_marlin_gemm_fp16",
|
||||
description="gptq_marlin_gemm",
|
||||
).blocked_autorange(min_run_time=min_run_time)
|
||||
)
|
||||
|
||||
results.append(
|
||||
benchmark.Timer(
|
||||
stmt="output = gptq_marlin_gemm(a, marlin_q_w, marlin_s, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, True, False)", # noqa: E501
|
||||
stmt="output = gptq_marlin_gemm(a, None, marlin_q_w, marlin_s, marlin_s2, marlin_zp, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, quant_type, size_m, size_n, size_k, is_k_full, False, True, False)", # noqa: E501
|
||||
globals=globals,
|
||||
label=label,
|
||||
sub_label=sub_label,
|
||||
@ -209,10 +281,7 @@ def bench_run(
|
||||
).blocked_autorange(min_run_time=min_run_time)
|
||||
)
|
||||
|
||||
if (
|
||||
quant_type in GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES
|
||||
and group_size in GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES
|
||||
):
|
||||
if marlin_24_supported:
|
||||
results.append(
|
||||
benchmark.Timer(
|
||||
stmt="output = gptq_marlin_24_gemm(a, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s, marlin_24_workspace.scratch, quant_type, size_m, size_n, size_k)", # noqa: E501
|
||||
@ -223,17 +292,18 @@ def bench_run(
|
||||
).blocked_autorange(min_run_time=min_run_time)
|
||||
)
|
||||
|
||||
results.append(
|
||||
benchmark.Timer(
|
||||
stmt="q_res = gptq_marlin_repack(q_w_gptq, repack_sort_indices, size_k, size_n, quant_type.size_bits)", # noqa: E501
|
||||
globals=globals,
|
||||
label=label,
|
||||
sub_label=sub_label,
|
||||
description="gptq_marlin_repack",
|
||||
).blocked_autorange(min_run_time=min_run_time)
|
||||
)
|
||||
if repack_supported:
|
||||
results.append(
|
||||
benchmark.Timer(
|
||||
stmt="q_res = gptq_marlin_repack(q_w_gptq, repack_sort_indices, size_k, size_n, quant_type.size_bits)", # noqa: E501
|
||||
globals=globals,
|
||||
label=label,
|
||||
sub_label=sub_label,
|
||||
description="gptq_marlin_repack",
|
||||
).blocked_autorange(min_run_time=min_run_time)
|
||||
)
|
||||
|
||||
if as_supported_case:
|
||||
if allspark_supported:
|
||||
results.append(
|
||||
benchmark.Timer(
|
||||
stmt="output = allspark_w8a16_gemm(a, qw_reorder, s_reorder, zp_reorder, size_n, group_size, sm_count, sm_version, CUBLAS_M_THRESHOLD, False, True)", # noqa: E501
|
||||
@ -249,7 +319,6 @@ def main(args):
|
||||
print("Benchmarking models:")
|
||||
for i, model in enumerate(args.models):
|
||||
print(f"[{i}] {model}")
|
||||
|
||||
results: list[benchmark.Measurement] = []
|
||||
|
||||
for model in args.models:
|
||||
@ -277,14 +346,17 @@ def main(args):
|
||||
):
|
||||
continue
|
||||
|
||||
for quant_type in query_marlin_supported_quant_types(False):
|
||||
for quant_type in query_marlin_supported_quant_types():
|
||||
if (
|
||||
len(args.limit_num_bits) > 0
|
||||
and quant_type.size_bits not in args.limit_num_bits
|
||||
):
|
||||
continue
|
||||
|
||||
for group_size in MARLIN_SUPPORTED_GROUP_SIZES:
|
||||
for group_size in (
|
||||
MARLIN_SUPPORTED_GROUP_SIZES
|
||||
+ FP4_MARLIN_SUPPORTED_GROUP_SIZES
|
||||
):
|
||||
if (
|
||||
len(args.limit_group_size) > 0
|
||||
and group_size not in args.limit_group_size
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import argparse
|
||||
import json
|
||||
@ -6,7 +7,6 @@ import time
|
||||
from contextlib import nullcontext
|
||||
from datetime import datetime
|
||||
from itertools import product
|
||||
from types import SimpleNamespace
|
||||
from typing import Any, TypedDict
|
||||
|
||||
import ray
|
||||
@@ -42,7 +42,7 @@ def benchmark_config(
|
||||
use_fp8_w8a8: bool,
|
||||
use_int8_w8a16: bool,
|
||||
num_iters: int = 100,
|
||||
block_quant_shape: List[int] = None,
|
||||
block_quant_shape: list[int] = None,
|
||||
use_deep_gemm: bool = False,
|
||||
) -> float:
|
||||
init_dtype = torch.float16 if use_fp8_w8a8 else dtype
|
||||
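The List[int] -> list[int] edits in this file only swap typing.List for the builtin generic that PEP 585 made legal in annotations on Python 3.9+; a trivial illustration of the resulting style (stub only, not the real function):

from __future__ import annotations

def benchmark_config_stub(block_quant_shape: list[int] | None = None) -> float:
    # Annotation style only; the real benchmark_config is defined in the hunk above.
    return 0.0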
@@ -399,7 +399,7 @@ class BenchmarkWorker:
|
||||
dtype: torch.dtype,
|
||||
use_fp8_w8a8: bool,
|
||||
use_int8_w8a16: bool,
|
||||
block_quant_shape: List[int] = None,
|
||||
block_quant_shape: list[int] = None,
|
||||
use_deep_gemm: bool = False,
|
||||
) -> tuple[dict[str, int], float]:
|
||||
current_platform.seed_everything(self.seed)
|
||||
@ -531,7 +531,7 @@ def save_configs(
|
||||
dtype: torch.dtype,
|
||||
use_fp8_w8a8: bool,
|
||||
use_int8_w8a16: bool,
|
||||
block_quant_shape: List[int],
|
||||
block_quant_shape: list[int],
|
||||
) -> None:
|
||||
dtype_str = get_config_dtype_str(
|
||||
dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8
|
||||
@ -562,7 +562,6 @@ def main(args: argparse.Namespace):
|
||||
config = get_config(model=args.model, trust_remote_code=args.trust_remote_code)
|
||||
if args.model_prefix:
|
||||
config = getattr(config, args.model_prefix)
|
||||
config = SimpleNamespace(**config)
|
||||
|
||||
if config.architectures[0] == "DbrxForCausalLM":
|
||||
E = config.ffn_config.moe_num_experts
|
||||
@@ -594,11 +593,7 @@ def main(args: argparse.Namespace):
shard_intermediate_size = 2 * intermediate_size // args.tp_size

hidden_size = config.hidden_size
dtype = (
torch.float16
if current_platform.is_rocm()
else getattr(torch, config.torch_dtype)
)
dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
use_fp8_w8a8 = args.dtype == "fp8_w8a8"
use_int8_w8a16 = args.dtype == "int8_w8a16"
block_quant_shape = get_weight_block_size_safety(config)

benchmarks/kernels/benchmark_moe_align_block_size.py (new file, 159 lines)
@@ -0,0 +1,159 @@
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import argparse
|
||||
import itertools
|
||||
|
||||
import torch
|
||||
|
||||
from vllm import _custom_ops as ops
|
||||
from vllm.model_executor.layers.fused_moe.moe_align_block_size import (
|
||||
moe_align_block_size_triton,
|
||||
)
|
||||
from vllm.triton_utils import triton
|
||||
|
||||
|
||||
def get_topk_ids(num_tokens: int, num_experts: int, topk: int) -> torch.Tensor:
|
||||
return torch.stack(
|
||||
[
|
||||
torch.randperm(num_experts, dtype=torch.int32, device="cuda")[:topk]
|
||||
for _ in range(num_tokens)
|
||||
]
|
||||
)
|
||||
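# Note: randperm makes the topk expert ids for each token distinct, matching
# what a real top-k router produces (no duplicate experts per token).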
|
||||
|
||||
def check_correctness(num_tokens, num_experts=256, block_size=256, topk=8):
|
||||
"""
|
||||
Verifies vllm vs. Triton
|
||||
"""
|
||||
topk_ids = get_topk_ids(num_tokens, num_experts, topk)
|
||||
|
||||
# 1. malloc space for triton and vllm
|
||||
# malloc enough space (max_num_tokens_padded) for the sorted ids
|
||||
max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1)
|
||||
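# Worked example: with num_tokens=1024, topk=8, num_experts=64, block_size=256
# this is 1024*8 + 64*(256-1) = 24512 slots, the worst case in which every
# expert's token count gets rounded up to a full block.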
sorted_ids_triton = torch.empty(
|
||||
(max_num_tokens_padded,), dtype=torch.int32, device="cuda"
|
||||
)
|
||||
sorted_ids_triton.fill_(topk_ids.numel()) # fill with sentinel value
|
||||
expert_ids_triton = torch.zeros(
|
||||
(max_num_tokens_padded // block_size,), dtype=torch.int32, device="cuda"
|
||||
)
|
||||
num_tokens_post_pad_triton = torch.empty((1,), dtype=torch.int32, device="cuda")
|
||||
|
||||
sorted_ids_vllm = torch.empty_like(sorted_ids_triton)
|
||||
sorted_ids_vllm.fill_(topk_ids.numel())
|
||||
expert_ids_vllm = torch.zeros_like(expert_ids_triton)
|
||||
num_tokens_post_pad_vllm = torch.empty_like(num_tokens_post_pad_triton)
|
||||
|
||||
# 2. run implementations
|
||||
moe_align_block_size_triton(
|
||||
topk_ids,
|
||||
num_experts,
|
||||
block_size,
|
||||
sorted_ids_triton,
|
||||
expert_ids_triton,
|
||||
num_tokens_post_pad_triton,
|
||||
)
|
||||
|
||||
ops.moe_align_block_size(
|
||||
topk_ids,
|
||||
num_experts,
|
||||
block_size,
|
||||
sorted_ids_vllm,
|
||||
expert_ids_vllm,
|
||||
num_tokens_post_pad_vllm,
|
||||
)
|
||||
print(f"✅ VLLM implementation works with {num_experts} experts!")
|
||||
|
||||
# 3. compare results
|
||||
if torch.allclose(expert_ids_triton, expert_ids_vllm) and torch.allclose(
|
||||
num_tokens_post_pad_triton, num_tokens_post_pad_vllm
|
||||
):
|
||||
print("✅ Triton and VLLM implementations match.")
|
||||
else:
|
||||
print("❌ Triton and VLLM implementations DO NOT match.")
|
||||
print("Triton expert_ids:", expert_ids_triton)
|
||||
print("VLLM expert_ids:", expert_ids_vllm)
|
||||
print("Triton num_tokens_post_pad:", num_tokens_post_pad_triton)
|
||||
print("VLLM num_tokens_post_pad:", num_tokens_post_pad_vllm)
|
||||
|
||||
|
||||
# test configurations
|
||||
num_tokens_range = [1, 16, 256, 4096]
|
||||
num_experts_range = [16, 64, 224, 256, 280, 512]
|
||||
topk_range = [1, 2, 8]
|
||||
configs = list(itertools.product(num_tokens_range, num_experts_range, topk_range))
|
||||
|
||||
|
||||
@triton.testing.perf_report(
|
||||
triton.testing.Benchmark(
|
||||
x_names=["num_tokens", "num_experts", "topk"],
|
||||
x_vals=configs,
|
||||
line_arg="provider",
|
||||
line_vals=["vllm", "triton"], # "triton"
|
||||
line_names=["VLLM", "Triton"], # "Triton"
|
||||
plot_name="moe-align-block-size-performance",
|
||||
args={},
|
||||
)
|
||||
)
|
||||
def benchmark(num_tokens, num_experts, topk, provider):
|
||||
"""Benchmark function for Triton."""
|
||||
block_size = 256
|
||||
topk_ids = get_topk_ids(num_tokens, num_experts, topk)
|
||||
|
||||
max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1)
|
||||
sorted_ids = torch.empty((max_num_tokens_padded,), dtype=torch.int32, device="cuda")
|
||||
sorted_ids.fill_(topk_ids.numel())
|
||||
max_num_m_blocks = max_num_tokens_padded // block_size
|
||||
expert_ids = torch.empty((max_num_m_blocks,), dtype=torch.int32, device="cuda")
|
||||
num_tokens_post_pad = torch.empty((1,), dtype=torch.int32, device="cuda")
|
||||
|
||||
quantiles = [0.5, 0.2, 0.8]
|
||||
|
||||
if provider == "vllm":
|
||||
ms, min_ms, max_ms = triton.testing.do_bench(
|
||||
lambda: ops.moe_align_block_size(
|
||||
topk_ids,
|
||||
num_experts,
|
||||
block_size,
|
||||
sorted_ids.clone(),
|
||||
expert_ids.clone(),
|
||||
num_tokens_post_pad.clone(),
|
||||
),
|
||||
quantiles=quantiles,
|
||||
)
|
||||
elif provider == "triton":
|
||||
ms, min_ms, max_ms = triton.testing.do_bench(
|
||||
lambda: moe_align_block_size_triton(
|
||||
topk_ids,
|
||||
num_experts,
|
||||
block_size,
|
||||
sorted_ids.clone(),
|
||||
expert_ids.clone(),
|
||||
num_tokens_post_pad.clone(),
|
||||
),
|
||||
quantiles=quantiles,
|
||||
)
|
||||
|
||||
return 1000 * ms, 1000 * max_ms, 1000 * min_ms
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument(
|
||||
"--num_experts",
|
||||
type=int,
|
||||
default=64,
|
||||
choices=[8, 16, 32, 64, 128, 256],
|
||||
)
|
||||
parser.add_argument(
|
||||
"--topk",
|
||||
type=int,
|
||||
default=8,
|
||||
choices=[2, 4, 8],
|
||||
help="Top-k value for correctness check.",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
print("Running correctness check...")
|
||||
check_correctness(num_tokens=1024, num_experts=args.num_experts, topk=args.topk)
|
||||
benchmark.run(print_data=True, show_plots=True)
|
||||
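A hedged usage sketch for the new script: besides the CLI entry point above, the pieces can be driven directly when the file is importable (module name assumed from the file name):

from benchmark_moe_align_block_size import check_correctness, benchmark

# One-off correctness check for a custom configuration...
check_correctness(num_tokens=256, num_experts=64, block_size=256, topk=8)
# ...and the full (num_tokens, num_experts, topk) sweep without plots.
benchmark.run(print_data=True)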
@@ -1,4 +1,5 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import argparse
|
||||
from typing import Any, TypedDict
|
||||
|
||||
@ -1,4 +1,5 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import random
|
||||
import time
|
||||
|
||||
@ -1,4 +1,5 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import time
|
||||
|
||||
|
||||
@ -1,4 +1,5 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import itertools
|
||||
from typing import Optional, Union
|
||||
|
||||
@ -1,4 +1,5 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from itertools import accumulate
|
||||
from typing import Optional
|
||||
@ -22,7 +23,7 @@ def benchmark_rope_kernels_multi_lora(
|
||||
seed: int,
|
||||
device: str,
|
||||
max_position: int = 8192,
|
||||
base: int = 10000,
|
||||
base: float = 10000,
|
||||
) -> None:
|
||||
current_platform.seed_everything(seed)
|
||||
torch.set_default_device(device)
|
||||
|
||||
@ -1,4 +1,5 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
WEIGHT_SHAPES = {
|
||||
"ideal": [[4 * 256 * 32, 256 * 32]],
|
||||
|
||||
@ -1,4 +1,5 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
# Adapted from sglang quantization/tuning_block_wise_kernel.py
|
||||
|
||||
import argparse
|
||||
|
||||
@ -1,4 +1,5 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
# fmt: off
|
||||
# ruff: noqa: E501
|
||||
import time
|
||||
@ -84,12 +85,6 @@ def benchmark_shape(m: int,
|
||||
|
||||
# === DeepGEMM Implementation ===
|
||||
def deepgemm_gemm():
|
||||
# A quantization is inside the loop as it depends on activations
|
||||
# A_deepgemm, A_scale_deepgemm = per_token_cast_to_fp8(A)
|
||||
# A_deepgemm, A_scale_deepgemm = per_token_group_quant_fp8(
|
||||
# A, block_size[1])
|
||||
# A_scale_aligned = get_col_major_tma_aligned_tensor(A_scale_deepgemm)
|
||||
# C_deepgemm = torch.empty((m, n), device='cuda', dtype=torch.bfloat16)
|
||||
deep_gemm.gemm_fp8_fp8_bf16_nt((A_deepgemm, A_scale_deepgemm),
|
||||
(B_deepgemm, B_scale_deepgemm),
|
||||
C_deepgemm)
|
||||
@ -97,8 +92,6 @@ def benchmark_shape(m: int,
|
||||
|
||||
# === vLLM Triton Implementation ===
|
||||
def vllm_triton_gemm():
|
||||
# A quantization is inside the loop as it depends on activations
|
||||
# A_vllm, A_scale_vllm = per_token_group_quant_fp8(A, block_size[1])
|
||||
return w8a8_block_fp8_matmul(A_vllm,
|
||||
B_vllm,
|
||||
A_scale_vllm,
|
||||
@ -108,9 +101,6 @@ def benchmark_shape(m: int,
|
||||
|
||||
# === vLLM CUTLASS Implementation ===
|
||||
def vllm_cutlass_gemm():
|
||||
# A quantization is inside the loop as it depends on activations
|
||||
# A_vllm_cutlass, A_scale_vllm_cutlass = per_token_group_quant_fp8(
|
||||
# A, block_size[1], column_major_scales=True)
|
||||
return ops.cutlass_scaled_mm(A_vllm_cutlass,
|
||||
B_vllm.T,
|
||||
scale_a=A_scale_vllm_cutlass,
|
||||
|
||||
@ -1,4 +1,5 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import math
|
||||
import pickle
|
||||
|
||||
@ -1,4 +1,5 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import dataclasses
|
||||
from collections.abc import Iterable
|
||||
|
||||
@ -1,4 +1,5 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# Weight Shapes are in the format
|
||||
# ([K, N], TP_SPLIT_DIM)
|
||||
@ -48,4 +49,50 @@ WEIGHT_SHAPES = {
|
||||
([16384, 106496], 1),
|
||||
([53248, 16384], 0),
|
||||
],
|
||||
"meta-llama/Llama-3.1-8B-Instruct": [
|
||||
([4096, 6144], 1),
|
||||
([4096, 4096], 0),
|
||||
([4096, 28672], 1),
|
||||
([14336, 4096], 0),
|
||||
],
|
||||
"meta-llama/Llama-3.3-70B-Instruct": [
|
||||
([8192, 10240], 1),
|
||||
([8192, 8192], 0),
|
||||
([8192, 57344], 1),
|
||||
([28672, 8192], 0),
|
||||
],
|
||||
"mistralai/Mistral-Large-Instruct-2407": [
|
||||
([12288, 14336], 1),
|
||||
([12288, 12288], 0),
|
||||
([12288, 57344], 1),
|
||||
([28672, 12288], 0),
|
||||
],
|
||||
"Qwen/Qwen2.5-7B-Instruct": [
|
||||
([3584, 4608], 1),
|
||||
([3584, 3584], 0),
|
||||
([3584, 37888], 1),
|
||||
([18944, 3584], 0),
|
||||
],
|
||||
"Qwen/Qwen2.5-32B-Instruct": [
|
||||
([5120, 7168], 1),
|
||||
([5120, 5120], 0),
|
||||
([5120, 55296], 1),
|
||||
([27648, 5120], 0),
|
||||
],
|
||||
"Qwen/Qwen2.5-72B-Instruct": [
|
||||
([8192, 10240], 1),
|
||||
([8192, 8192], 0),
|
||||
([8192, 59136], 1),
|
||||
([29568, 8192], 0),
|
||||
],
|
||||
"deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct": [
|
||||
([2048, 3072], 1),
|
||||
([2048, 4096], 1),
|
||||
([2048, 2048], 0),
|
||||
([2048, 576], 0),
|
||||
([2048, 21888], 1),
|
||||
([10944, 2048], 0),
|
||||
([2048, 2816], 1),
|
||||
([1408, 2048], 0),
|
||||
],
|
||||
}
|
||||
|
||||
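A small sketch of how these ([K, N], TP_SPLIT_DIM) entries are usually consumed: the flagged dimension is divided by the tensor-parallel size. The helper below is illustrative and not part of the file above.

def shard_weight_shape(kn, tp_split_dim, tp_size):
    """Split the K or N dimension of a weight shape across tp_size ranks."""
    k, n = kn
    if tp_split_dim == 0:
        k //= tp_size
    else:
        n //= tp_size
    return k, n

# e.g. the ([4096, 28672], 1) Llama-3.1-8B entry above becomes (4096, 7168) at TP=4.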
@@ -1,4 +1,5 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import cProfile
|
||||
import pstats
|
||||
|
||||
@ -75,6 +75,7 @@ if (MACOSX_FOUND AND CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64")
|
||||
else()
|
||||
find_isa(${CPUINFO} "avx2" AVX2_FOUND)
|
||||
find_isa(${CPUINFO} "avx512f" AVX512_FOUND)
|
||||
find_isa(${CPUINFO} "Power11" POWER11_FOUND)
|
||||
find_isa(${CPUINFO} "POWER10" POWER10_FOUND)
|
||||
find_isa(${CPUINFO} "POWER9" POWER9_FOUND)
|
||||
find_isa(${CPUINFO} "asimd" ASIMD_FOUND) # Check for ARM NEON support
|
||||
@ -106,13 +107,19 @@ elseif (AVX2_FOUND)
|
||||
list(APPEND CXX_COMPILE_FLAGS "-mavx2")
|
||||
message(WARNING "vLLM CPU backend using AVX2 ISA")
|
||||
|
||||
elseif (POWER9_FOUND OR POWER10_FOUND)
|
||||
elseif (POWER9_FOUND OR POWER10_FOUND OR POWER11_FOUND)
|
||||
message(STATUS "PowerPC detected")
|
||||
# Check for PowerPC VSX support
|
||||
list(APPEND CXX_COMPILE_FLAGS
|
||||
"-mvsx"
|
||||
"-mcpu=native"
|
||||
"-mtune=native")
|
||||
if (POWER9_FOUND)
|
||||
list(APPEND CXX_COMPILE_FLAGS
|
||||
"-mvsx"
|
||||
"-mcpu=power9"
|
||||
"-mtune=power9")
|
||||
elseif (POWER10_FOUND OR POWER11_FOUND)
|
||||
list(APPEND CXX_COMPILE_FLAGS
|
||||
"-mvsx"
|
||||
"-mcpu=power10"
|
||||
"-mtune=power10")
|
||||
endif()
|
||||
|
||||
elseif (ASIMD_FOUND)
|
||||
message(STATUS "ARMv8 or later architecture detected")
|
||||
|
||||
@ -38,7 +38,7 @@ else()
|
||||
FetchContent_Declare(
|
||||
vllm-flash-attn
|
||||
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
|
||||
GIT_TAG 8798f27777fb57f447070301bf33a9f9c607f491
|
||||
GIT_TAG 5f3644181c7a15345ce20bfc65af117d3601b524
|
||||
GIT_PROGRESS TRUE
|
||||
# Don't share the vllm-flash-attn build between build types
|
||||
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
|
||||
@ -46,22 +46,38 @@ else()
|
||||
endif()
|
||||
|
||||
|
||||
# Ensure the vllm/vllm_flash_attn directory exists before installation
|
||||
install(CODE "file(MAKE_DIRECTORY \"\${CMAKE_INSTALL_PREFIX}/vllm/vllm_flash_attn\")" ALL_COMPONENTS)
|
||||
|
||||
# Make sure vllm-flash-attn install rules are nested under vllm/
|
||||
# This is here to support installing all components under the same prefix with cmake --install.
|
||||
# setup.py installs every component separately but uses the same prefix for all.
|
||||
# ALL_COMPONENTS is used to avoid duplication for FA2 and FA3,
|
||||
# and these statements don't hurt when installing neither component.
|
||||
install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY FALSE)" ALL_COMPONENTS)
|
||||
install(CODE "set(OLD_CMAKE_INSTALL_PREFIX \"\${CMAKE_INSTALL_PREFIX}\")" ALL_COMPONENTS)
|
||||
install(CODE "set(CMAKE_INSTALL_PREFIX \"\${CMAKE_INSTALL_PREFIX}/vllm/\")" ALL_COMPONENTS)
|
||||
|
||||
# Fetch the vllm-flash-attn library
|
||||
FetchContent_MakeAvailable(vllm-flash-attn)
|
||||
message(STATUS "vllm-flash-attn is available at ${vllm-flash-attn_SOURCE_DIR}")
|
||||
|
||||
# Restore the install prefix
|
||||
install(CODE "set(CMAKE_INSTALL_PREFIX \"\${OLD_CMAKE_INSTALL_PREFIX}\")" ALL_COMPONENTS)
|
||||
install(CODE "set(CMAKE_INSTALL_LOCAL_ONLY TRUE)" ALL_COMPONENTS)
|
||||
|
||||
# Copy over the vllm-flash-attn python files (duplicated for fa2 and fa3, in
|
||||
# case only one is built, in the case both are built redundant work is done)
|
||||
install(
|
||||
DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/
|
||||
DESTINATION vllm_flash_attn
|
||||
DESTINATION vllm/vllm_flash_attn
|
||||
COMPONENT _vllm_fa2_C
|
||||
FILES_MATCHING PATTERN "*.py"
|
||||
)
|
||||
|
||||
install(
|
||||
DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/
|
||||
DESTINATION vllm_flash_attn
|
||||
DESTINATION vllm/vllm_flash_attn
|
||||
COMPONENT _vllm_fa3_C
|
||||
FILES_MATCHING PATTERN "*.py"
|
||||
)
|
||||
|
||||
@ -1,5 +1,6 @@
|
||||
#!/usr/bin/env python3
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
#
|
||||
# A command line tool for running pytorch's hipify preprocessor on CUDA
|
||||
|
||||
@ -76,7 +76,7 @@ function (hipify_sources_target OUT_SRCS NAME ORIG_SRCS)
|
||||
set(CSRC_BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}/csrc)
|
||||
add_custom_target(
|
||||
hipify${NAME}
|
||||
COMMAND ${CMAKE_SOURCE_DIR}/cmake/hipify.py -p ${CMAKE_SOURCE_DIR}/csrc -o ${CSRC_BUILD_DIR} ${SRCS}
|
||||
COMMAND ${Python_EXECUTABLE} ${CMAKE_SOURCE_DIR}/cmake/hipify.py -p ${CMAKE_SOURCE_DIR}/csrc -o ${CSRC_BUILD_DIR} ${SRCS}
|
||||
DEPENDS ${CMAKE_SOURCE_DIR}/cmake/hipify.py ${SRCS}
|
||||
BYPRODUCTS ${HIP_SRCS}
|
||||
COMMENT "Running hipify on ${NAME} extension source files.")
|
||||
@ -122,6 +122,7 @@ function (get_torch_gpu_compiler_flags OUT_GPU_FLAGS GPU_LANG)
|
||||
"-DENABLE_FP8"
|
||||
"-U__HIP_NO_HALF_CONVERSIONS__"
|
||||
"-U__HIP_NO_HALF_OPERATORS__"
|
||||
"-Werror=unused-variable"
|
||||
"-fno-gpu-rdc")
|
||||
|
||||
endif()
|
||||
|
||||
@@ -143,6 +143,14 @@ void merge_attn_states_launcher(torch::Tensor& output,
const uint pack_size = 16 / sizeof(scalar_t);
TORCH_CHECK(head_size % pack_size == 0,
"headsize must be multiple of pack_size:", pack_size);
TORCH_CHECK(output.stride(-2) == head_size && output.stride(-1) == 1,
"output heads must be contiguous in memory");
TORCH_CHECK(
prefix_output.stride(-2) == head_size && prefix_output.stride(-1) == 1,
"prefix_output heads must be contiguous in memory");
TORCH_CHECK(
suffix_output.stride(-2) == head_size && suffix_output.stride(-1) == 1,
"suffix_output heads must be contiguous in memory");
float* output_lse_ptr = nullptr;
if (output_lse.has_value()) {
output_lse_ptr = output_lse.value().data_ptr<float>();

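The stride checks added here require the head dimension of every tensor to be densely packed; an equivalent predicate in Python, shown only to spell the condition out:

import torch

def heads_contiguous(t: torch.Tensor, head_size: int) -> bool:
    # Last dim is unit-stride and rows of head_size elements sit back to back,
    # mirroring the TORCH_CHECKs above.
    return t.stride(-2) == head_size and t.stride(-1) == 1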
@@ -119,7 +119,7 @@ typename T::Fmha::Arguments args_from_options(
|
||||
{static_cast<ElementOut*>(out.data_ptr()), stride_O,
|
||||
static_cast<ElementAcc*>(nullptr), stride_LSE},
|
||||
hw_info,
|
||||
-1, // split_kv
|
||||
1, // split_kv
|
||||
nullptr, // is_var_split_kv
|
||||
};
|
||||
// TODO(kaixih@nvidia): When split_kv=-1 and is_var_split_kv=false, we compute
|
||||
@ -207,7 +207,7 @@ void cutlass_mla_decode_sm100a(torch::Tensor const& out,
|
||||
"page_table must be a 32-bit integer tensor");
|
||||
|
||||
auto in_dtype = q_nope.dtype();
|
||||
at::cuda::CUDAGuard device_guard{(char)q_nope.get_device()};
|
||||
const at::cuda::OptionalCUDAGuard device_guard(device_of(q_nope));
|
||||
const cudaStream_t stream =
|
||||
at::cuda::getCurrentCUDAStream(q_nope.get_device());
|
||||
if (in_dtype == at::ScalarType::Half) {
|
||||
|
||||
@ -65,9 +65,6 @@ void paged_attention_v1_launcher(
|
||||
int kv_block_stride = key_cache.stride(0);
|
||||
int kv_head_stride = key_cache.stride(1);
|
||||
|
||||
[[maybe_unused]] int thread_group_size = MAX(WARP_SIZE / BLOCK_SIZE, 1);
|
||||
assert(head_size % thread_group_size == 0);
|
||||
|
||||
// NOTE: alibi_slopes is optional.
|
||||
const float* alibi_slopes_ptr =
|
||||
alibi_slopes
|
||||
@ -193,4 +190,4 @@ void paged_attention_v1(
|
||||
#undef WARP_SIZE
|
||||
#undef MAX
|
||||
#undef MIN
|
||||
#undef DIVIDE_ROUND_UP
|
||||
#undef DIVIDE_ROUND_UP
|
||||
|
||||
@ -66,9 +66,6 @@ void paged_attention_v2_launcher(
|
||||
int kv_block_stride = key_cache.stride(0);
|
||||
int kv_head_stride = key_cache.stride(1);
|
||||
|
||||
[[maybe_unused]] int thread_group_size = MAX(WARP_SIZE / BLOCK_SIZE, 1);
|
||||
assert(head_size % thread_group_size == 0);
|
||||
|
||||
// NOTE: alibi_slopes is optional.
|
||||
const float* alibi_slopes_ptr =
|
||||
alibi_slopes
|
||||
@ -203,4 +200,4 @@ void paged_attention_v2(
|
||||
#undef WARP_SIZE
|
||||
#undef MAX
|
||||
#undef MIN
|
||||
#undef DIVIDE_ROUND_UP
|
||||
#undef DIVIDE_ROUND_UP
|
||||
|
||||
@ -137,8 +137,8 @@ FORCE_INLINE std::pair<T, T> reduceSoftmaxAlibi(T* data, const int size,
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
FORCE_INLINE void reducePartitonSoftmax(const T* max_data, T* sum_data,
|
||||
const int size) {
|
||||
FORCE_INLINE void reducePartitionSoftmax(const T* max_data, T* sum_data,
|
||||
const int size) {
|
||||
T max = max_data[0];
|
||||
for (int i = 1; i < size; ++i) {
|
||||
max = max >= max_data[i] ? max : max_data[i];
|
||||
@ -634,7 +634,7 @@ struct paged_attention_v2_impl {
|
||||
|
||||
if (partition_num == 1) continue;
|
||||
|
||||
reducePartitonSoftmax(
|
||||
reducePartitionSoftmax(
|
||||
max_logits + seq_idx * num_heads * max_num_partitions +
|
||||
head_idx * max_num_partitions,
|
||||
exp_sums + seq_idx * num_heads * max_num_partitions +
|
||||
|
||||
@ -83,7 +83,7 @@ struct FP16Vec16 : public Vec<FP16Vec16> {
|
||||
explicit FP16Vec16(const void* ptr)
|
||||
: reg((__m256i)_mm256_loadu_si256((__m256i*)ptr)) {}
|
||||
|
||||
// non-temproal load
|
||||
// non-temporal load
|
||||
explicit FP16Vec16(bool, void* ptr)
|
||||
: reg(_mm256_stream_load_si256((__m256i*)ptr)) {}
|
||||
|
||||
@ -120,7 +120,7 @@ struct BF16Vec16 : public Vec<BF16Vec16> {
|
||||
explicit BF16Vec16(const void* ptr)
|
||||
: reg((__m256i)_mm256_loadu_si256((__m256i*)ptr)) {}
|
||||
|
||||
// non-temproal load
|
||||
// non-temporal load
|
||||
explicit BF16Vec16(bool, void* ptr)
|
||||
: reg(_mm256_stream_load_si256((__m256i*)ptr)) {}
|
||||
|
||||
@ -327,7 +327,7 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
|
||||
// normal load
|
||||
explicit FP32Vec16(const float* ptr) : reg(_mm512_loadu_ps(ptr)) {}
|
||||
|
||||
// non-temproal load
|
||||
// non-temporal load
|
||||
explicit FP32Vec16(bool, void* ptr)
|
||||
: reg((__m512)_mm512_stream_load_si512(ptr)) {}
|
||||
|
||||
@ -576,7 +576,7 @@ struct INT8Vec64 : public Vec<INT8Vec64> {
|
||||
// normal load
|
||||
explicit INT8Vec64(void* ptr) : reg(_mm512_loadu_epi8(ptr)) {}
|
||||
|
||||
// non-temproal load
|
||||
// non-temporal load
|
||||
explicit INT8Vec64(bool, void* ptr) : reg(_mm512_stream_load_si512(ptr)) {}
|
||||
|
||||
void save(void* ptr) const { _mm512_storeu_epi8(ptr, reg); }
|
||||
@ -587,7 +587,7 @@ struct INT8Vec64 : public Vec<INT8Vec64> {
|
||||
_mm512_mask_storeu_epi8(ptr, mask, reg);
|
||||
}
|
||||
|
||||
// non-temproal save
|
||||
// non-temporal save
|
||||
void nt_save(int8_t* ptr) { _mm512_stream_si512((__m512i*)ptr, reg); }
|
||||
};
|
||||
#endif
|
||||
|
||||
@ -131,16 +131,19 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
|
||||
|
||||
// Quantization
|
||||
#ifdef __AVX512F__
|
||||
at::Tag stride_tag = at::Tag::needs_fixed_stride_order;
|
||||
// Compute int8 quantized tensor for given scaling factor.
|
||||
ops.def(
|
||||
"static_scaled_int8_quant(Tensor! out, Tensor input, Tensor scale,"
|
||||
"Tensor? azp) -> ()");
|
||||
"Tensor? azp) -> ()",
|
||||
{stride_tag});
|
||||
ops.impl("static_scaled_int8_quant", torch::kCPU, &static_scaled_int8_quant);
|
||||
|
||||
// Compute int8 quantized tensor and scaling factor
|
||||
ops.def(
|
||||
"dynamic_scaled_int8_quant(Tensor! out, Tensor input, Tensor! scale, "
|
||||
"Tensor!? azp) -> ()");
|
||||
"Tensor!? azp) -> ()",
|
||||
{stride_tag});
|
||||
ops.impl("dynamic_scaled_int8_quant", torch::kCPU,
|
||||
&dynamic_scaled_int8_quant);
|
||||
// W8A8 GEMM, supporting symmetric per-tensor or per-row/column
|
||||
@ -148,7 +151,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
|
||||
ops.def(
|
||||
"cutlass_scaled_mm(Tensor! out, Tensor a,"
|
||||
" Tensor b, Tensor a_scales,"
|
||||
" Tensor b_scales, Tensor? bias) -> ()");
|
||||
" Tensor b_scales, Tensor? bias) -> ()",
|
||||
{stride_tag});
|
||||
ops.impl("cutlass_scaled_mm", torch::kCPU, &int8_scaled_mm);
|
||||
// w8a8 GEMM, supporting asymmetric per-tensor or per-row/column
|
||||
// quantization.
|
||||
@ -156,7 +160,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
|
||||
"cutlass_scaled_mm_azp(Tensor! out, Tensor a,"
|
||||
" Tensor b, Tensor a_scales,"
|
||||
" Tensor b_scales, Tensor azp_adj,"
|
||||
" Tensor? azp, Tensor? bias) -> ()");
|
||||
" Tensor? azp, Tensor? bias) -> ()",
|
||||
{stride_tag});
|
||||
ops.impl("cutlass_scaled_mm_azp", torch::kCPU, &int8_scaled_mm_azp);
|
||||
#elif defined(__powerpc64__)
|
||||
// Compute int8 quantized tensor for given scaling factor.
|
||||
|
||||
@ -54,8 +54,7 @@ std::string init_cpu_threads_env(const std::string& cpu_ids) {
|
||||
*(src_mask->maskp) = *(src_mask->maskp) ^ *(mask->maskp);
|
||||
int page_num = numa_migrate_pages(pid, src_mask, mask);
|
||||
if (page_num == -1) {
|
||||
TORCH_CHECK(false,
|
||||
"numa_migrate_pages failed. errno: " + std::to_string(errno));
|
||||
TORCH_WARN("numa_migrate_pages failed. errno: " + std::to_string(errno));
|
||||
}
|
||||
|
||||
// restrict memory allocation node.
|
||||
@ -105,4 +104,4 @@ std::string init_cpu_threads_env(const std::string& cpu_ids) {
|
||||
|
||||
return ss.str();
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
csrc/custom_quickreduce.cu (new file, 114 lines)
@@ -0,0 +1,114 @@
#include <ATen/cuda/Exceptions.h>
|
||||
#include <c10/cuda/CUDAGuard.h>
|
||||
#include <c10/cuda/CUDAStream.h>
|
||||
#include <torch/all.h>
|
||||
|
||||
#ifdef USE_ROCM
|
||||
|
||||
#include "quickreduce/quick_reduce.h"
|
||||
|
||||
quickreduce::fptr_t init_custom_qr(int64_t rank, int64_t world_size,
|
||||
std::optional<int64_t> qr_max_size) {
|
||||
if (world_size > 8)
|
||||
throw std::invalid_argument("world size > 8 is not supported");
|
||||
if (world_size == 6)
|
||||
throw std::invalid_argument("world size == 6 is not supported");
|
||||
if (world_size % 2 != 0)
|
||||
throw std::invalid_argument("Odd num gpus is not supported for now");
|
||||
if (rank < 0 || rank >= world_size)
|
||||
throw std::invalid_argument("invalid rank passed in");
|
||||
quickreduce::DeviceComms* fptr = new quickreduce::DeviceComms();
|
||||
fptr->init(world_size, rank, qr_max_size);
|
||||
return (quickreduce::fptr_t)fptr;
|
||||
}
|
||||
|
||||
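The guards above accept only even world sizes up to 8 and reject 6, so in practice 2, 4 or 8 ranks. A plain-Python restatement of that validation (hypothetical helper, not the real binding):

def validate_qr_world_size(world_size: int, rank: int) -> None:
    if world_size > 8:
        raise ValueError("world size > 8 is not supported")
    if world_size == 6:
        raise ValueError("world size == 6 is not supported")
    if world_size % 2 != 0:
        raise ValueError("Odd num gpus is not supported for now")
    if rank < 0 or rank >= world_size:
        raise ValueError("invalid rank passed in")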
void qr_destroy(quickreduce::fptr_t _fa) {
|
||||
if (_fa) {
|
||||
auto fa = reinterpret_cast<quickreduce::DeviceComms*>(_fa);
|
||||
fa->destroy();
|
||||
delete fa;
|
||||
}
|
||||
}
|
||||
|
||||
torch::Tensor qr_get_handle(quickreduce::fptr_t _fa) {
|
||||
auto fa = reinterpret_cast<quickreduce::DeviceComms*>(_fa);
|
||||
hipIpcMemHandle_t handle = fa->get_handle();
|
||||
auto options =
|
||||
torch::TensorOptions().dtype(torch::kUInt8).device(torch::kCPU);
|
||||
auto data_handle =
|
||||
torch::empty({static_cast<int64_t>(sizeof(hipIpcMemHandle_t))}, options);
|
||||
std::memcpy(data_handle.data_ptr(), &handle, sizeof(hipIpcMemHandle_t));
|
||||
return data_handle;
|
||||
}
|
||||
|
||||
void qr_open_handles(quickreduce::fptr_t _fa,
|
||||
const std::vector<torch::Tensor>& handles) {
|
||||
auto fa = reinterpret_cast<quickreduce::DeviceComms*>(_fa);
|
||||
std::vector<hipIpcMemHandle_t> ipc_handles;
|
||||
ipc_handles.reserve(handles.size());
|
||||
for (auto& handle : handles) {
|
||||
// Ensure the tensor is on the same device as the current device.
|
||||
hipIpcMemHandle_t ipc_handle;
|
||||
std::memcpy(&ipc_handle, handle.data_ptr(), sizeof(hipIpcMemHandle_t));
|
||||
ipc_handles.push_back(ipc_handle);
|
||||
}
|
||||
fa->open_ipc_handles(ipc_handles);
|
||||
}
|
||||
|
||||
void qr_all_reduce(quickreduce::fptr_t _fa, torch::Tensor& inp,
|
||||
torch::Tensor& out, int64_t quant_level, bool cast_bf2half) {
|
||||
auto fa = reinterpret_cast<quickreduce::DeviceComms*>(_fa);
|
||||
const at::cuda::OptionalCUDAGuard device_guard(device_of(inp));
|
||||
auto stream = at::cuda::getCurrentHIPStreamMasqueradingAsCUDA();
|
||||
|
||||
TORCH_CHECK_EQ(inp.scalar_type(), out.scalar_type());
|
||||
TORCH_CHECK_EQ(inp.numel(), out.numel());
|
||||
TORCH_CHECK_LE(out.numel(), fa->kMaxProblemSize);
|
||||
if (out.scalar_type() == at::ScalarType::Half) {
|
||||
fa->allreduce<half, false>(reinterpret_cast<half*>(inp.data_ptr()),
|
||||
reinterpret_cast<half*>(out.data_ptr()),
|
||||
out.numel(), quant_level, stream);
|
||||
} else if (out.scalar_type() == at::ScalarType::BFloat16) {
|
||||
if (cast_bf2half) {
|
||||
fa->allreduce<half, true>(reinterpret_cast<half*>(inp.data_ptr()),
|
||||
reinterpret_cast<half*>(out.data_ptr()),
|
||||
out.numel(), quant_level, stream);
|
||||
} else {
|
||||
fa->allreduce<quickreduce::nv_bfloat16, false>(
|
||||
reinterpret_cast<quickreduce::nv_bfloat16*>(inp.data_ptr()),
|
||||
reinterpret_cast<quickreduce::nv_bfloat16*>(out.data_ptr()),
|
||||
out.numel(), quant_level, stream);
|
||||
}
|
||||
} else {
|
||||
throw std::runtime_error(
|
||||
"quick allreduce only supports float16 and bfloat16");
|
||||
}
|
||||
}
|
||||
|
||||
int64_t qr_max_size() {
|
||||
// The default is 2GB (2,147,483,648 bytes)
|
||||
return static_cast<int64_t>(std::numeric_limits<int32_t>::max()) + 1;
|
||||
}
|
||||
|
||||
#define INSTANTIATE_FOR_WORLDSIZE(T, Codec, cast_bf2half) \
|
||||
template struct quickreduce::AllReduceTwoshot<T, Codec<T, 2>, \
|
||||
cast_bf2half>; \
|
||||
template struct quickreduce::AllReduceTwoshot<T, Codec<T, 4>, \
|
||||
cast_bf2half>; \
|
||||
template struct quickreduce::AllReduceTwoshot<T, Codec<T, 8>, cast_bf2half>;
|
||||
|
||||
INSTANTIATE_FOR_WORLDSIZE(quickreduce::nv_bfloat16, quickreduce::CodecFP, false)
|
||||
INSTANTIATE_FOR_WORLDSIZE(quickreduce::nv_bfloat16, quickreduce::CodecQ4, false)
|
||||
INSTANTIATE_FOR_WORLDSIZE(quickreduce::nv_bfloat16, quickreduce::CodecQ6, false)
|
||||
INSTANTIATE_FOR_WORLDSIZE(quickreduce::nv_bfloat16, quickreduce::CodecQ8, false)
|
||||
INSTANTIATE_FOR_WORLDSIZE(quickreduce::nv_bfloat16, quickreduce::CodecFP, true)
|
||||
INSTANTIATE_FOR_WORLDSIZE(quickreduce::nv_bfloat16, quickreduce::CodecQ4, true)
|
||||
INSTANTIATE_FOR_WORLDSIZE(quickreduce::nv_bfloat16, quickreduce::CodecQ6, true)
|
||||
INSTANTIATE_FOR_WORLDSIZE(quickreduce::nv_bfloat16, quickreduce::CodecQ8, true)
|
||||
|
||||
INSTANTIATE_FOR_WORLDSIZE(half, quickreduce::CodecFP, false)
|
||||
INSTANTIATE_FOR_WORLDSIZE(half, quickreduce::CodecQ4, false)
|
||||
INSTANTIATE_FOR_WORLDSIZE(half, quickreduce::CodecQ6, false)
|
||||
INSTANTIATE_FOR_WORLDSIZE(half, quickreduce::CodecQ8, false)
|
||||
|
||||
#endif // USE_ROCM
|
||||
@ -1,4 +1,5 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import enum
|
||||
from typing import Union
|
||||
|
||||
@ -185,9 +185,7 @@ void causal_conv1d_fwd(const at::Tensor &x, const at::Tensor &weight,
|
||||
params.conv_states_ptr = nullptr;
|
||||
}
|
||||
|
||||
// Otherwise the kernel will be launched from cuda:0 device
|
||||
// Cast to char to avoid compiler warning about narrowing
|
||||
at::cuda::CUDAGuard device_guard{(char)x.get_device()};
|
||||
const at::cuda::OptionalCUDAGuard device_guard(device_of(x));
|
||||
auto stream = at::cuda::getCurrentCUDAStream().stream();
|
||||
DISPATCH_WTYPE_ITYPE_FLOAT_AND_HALF_AND_BF16(x.scalar_type(), "causal_conv1d_fwd", [&] {
|
||||
causal_conv1d_fwd_cuda<input_t, weight_t>(params, stream);
|
||||
@ -278,9 +276,7 @@ void causal_conv1d_update(const at::Tensor &x,
|
||||
params.conv_state_indices_ptr = nullptr;
|
||||
}
|
||||
|
||||
// Otherwise the kernel will be launched from cuda:0 device
|
||||
// Cast to char to avoid compiler warning about narrowing
|
||||
at::cuda::CUDAGuard device_guard{(char)x.get_device()};
|
||||
const at::cuda::OptionalCUDAGuard device_guard(device_of(x));
|
||||
auto stream = at::cuda::getCurrentCUDAStream().stream();
|
||||
DISPATCH_WTYPE_ITYPE_FLOAT_AND_HALF_AND_BF16(x.scalar_type(), "causal_conv1d_update", [&] {
|
||||
causal_conv1d_update_cuda<input_t, weight_t>(params, stream);
|
||||
|
||||
@ -647,9 +647,7 @@ void selective_scan_fwd(const torch::Tensor &u, const torch::Tensor &delta,
|
||||
);
|
||||
|
||||
|
||||
// Otherwise the kernel will be launched from cuda:0 device
|
||||
// Cast to char to avoid compiler warning about narrowing
|
||||
at::cuda::CUDAGuard device_guard{(char)u.get_device()};
|
||||
const at::cuda::OptionalCUDAGuard device_guard(device_of(u));
|
||||
auto stream = at::cuda::getCurrentCUDAStream().stream();
|
||||
DISPATCH_WTYPE_ITYPE_FLOAT_AND_HALF_AND_BF16(u.scalar_type(), "selective_scan_fwd", [&] {
|
||||
selective_scan_fwd_cuda<input_t, weight_t>(params, stream);
|
||||
|
||||
@ -1,4 +1,5 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import glob
|
||||
import itertools
|
||||
import os
|
||||
|
||||
@@ -13,232 +13,45 @@
|
||||
namespace vllm {
|
||||
namespace moe {
|
||||
|
||||
namespace {
|
||||
__device__ __forceinline__ int32_t index(int32_t total_col, int32_t row,
|
||||
int32_t col) {
|
||||
// don't worry about overflow because num_experts is relatively small
|
||||
return row * total_col + col;
|
||||
}
|
||||
} // namespace
|
||||
|
||||
template <typename scalar_t, typename token_cnts_t>
|
||||
__global__ void moe_align_block_size_kernel(scalar_t* __restrict__ topk_ids,
|
||||
int32_t* sorted_token_ids,
|
||||
int32_t* expert_ids,
|
||||
int32_t* total_tokens_post_pad,
|
||||
int32_t num_experts,
|
||||
int32_t block_size, size_t numel) {
|
||||
const size_t tokens_per_thread = CEILDIV(numel, blockDim.x);
|
||||
const size_t start_idx = threadIdx.x * tokens_per_thread;
|
||||
|
||||
extern __shared__ int32_t shared_mem[];
|
||||
int32_t* cumsum = shared_mem; // 1d tensor with shape (num_experts + 1)
|
||||
token_cnts_t* tokens_cnts =
|
||||
(token_cnts_t*)(shared_mem + num_experts +
|
||||
1); // 2d tensor with shape (blockDim.x + 1, num_experts)
|
||||
|
||||
for (int i = 0; i < num_experts; ++i) {
|
||||
tokens_cnts[index(num_experts, threadIdx.x + 1, i)] = 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* In the first step we compute token_cnts[thread_index + 1][expert_index],
|
||||
* which counts how many tokens in the token shard of thread_index are
|
||||
* assigned to expert expert_index.
|
||||
*/
|
||||
for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) {
|
||||
++tokens_cnts[index(num_experts, threadIdx.x + 1, topk_ids[i])];
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
// For each expert we accumulate the token counts from the different threads.
|
||||
if (threadIdx.x < num_experts) {
|
||||
tokens_cnts[index(num_experts, 0, threadIdx.x)] = 0;
|
||||
for (int i = 1; i <= blockDim.x; ++i) {
|
||||
tokens_cnts[index(num_experts, i, threadIdx.x)] +=
|
||||
tokens_cnts[index(num_experts, i - 1, threadIdx.x)];
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
// We accumulate the token counts of all experts in thread 0.
|
||||
if (threadIdx.x == 0) {
|
||||
cumsum[0] = 0;
|
||||
for (int i = 1; i <= num_experts; ++i) {
|
||||
cumsum[i] = cumsum[i - 1] +
|
||||
CEILDIV(tokens_cnts[index(num_experts, blockDim.x, i - 1)],
|
||||
block_size) *
|
||||
block_size;
|
||||
}
|
||||
*total_tokens_post_pad = static_cast<int32_t>(cumsum[num_experts]);
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
/**
|
||||
* For each expert, each thread processes the tokens of the corresponding
|
||||
* blocks and stores the corresponding expert_id for each block.
|
||||
*/
|
||||
if (threadIdx.x < num_experts) {
|
||||
for (int i = cumsum[threadIdx.x]; i < cumsum[threadIdx.x + 1];
|
||||
i += block_size) {
|
||||
expert_ids[i / block_size] = threadIdx.x;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Each thread processes a token shard, calculating the index of each token
|
||||
* after sorting by expert number. Given the example topk_ids =
|
||||
* [0,1,2,1,2,3,0,3,4] and block_size = 4, then the output would be [0, 6, *,
|
||||
* *, 1, 3, *, *, 2, 4, *, *, 5, 7, *, *, 8, *, *, *], where * represents a
|
||||
* padding value(preset in python).
|
||||
*/
|
||||
for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) {
|
||||
int32_t expert_id = topk_ids[i];
|
||||
/** The cumsum[expert_id] stores the starting index of the tokens that the
|
||||
* expert with expert_id needs to process, and
|
||||
* tokens_cnts[threadIdx.x][expert_id] stores the indices of the tokens
|
||||
* processed by the expert with expert_id within the current thread's token
|
||||
* shard.
|
||||
*/
|
||||
int32_t rank_post_pad =
|
||||
tokens_cnts[index(num_experts, threadIdx.x, expert_id)] +
|
||||
cumsum[expert_id];
|
||||
sorted_token_ids[rank_post_pad] = i;
|
||||
++tokens_cnts[index(num_experts, threadIdx.x, expert_id)];
|
||||
}
|
||||
}
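Both the kernel above and its replacement below implement the same alignment contract, which the comments illustrate with topk_ids = [0,1,2,1,2,3,0,3,4] and block_size = 4. A minimal PyTorch reference sketch of that contract (CPU-only, int64 outputs, sentinel handling borrowed from the benchmark script earlier in this diff):

import torch

def moe_align_block_size_ref(topk_ids: torch.Tensor, num_experts: int, block_size: int):
    # Pad every expert's token count up to a multiple of block_size.
    counts = torch.bincount(topk_ids.flatten(), minlength=num_experts)
    padded = ((counts + block_size - 1) // block_size) * block_size
    cumsum = torch.zeros(num_experts + 1, dtype=torch.int64)
    cumsum[1:] = torch.cumsum(padded, dim=0)

    # One expert id per block of padded, sorted token slots.
    expert_ids = torch.repeat_interleave(torch.arange(num_experts), padded // block_size)

    # Scatter token indices into their expert's segment; untouched slots keep
    # the sentinel value topk_ids.numel().
    sorted_token_ids = torch.full((int(cumsum[-1]),), topk_ids.numel(), dtype=torch.int64)
    fill = cumsum[:-1].clone()
    for i, e in enumerate(topk_ids.flatten().tolist()):
        sorted_token_ids[fill[e]] = i
        fill[e] += 1
    return sorted_token_ids, expert_ids, int(cumsum[-1])

For the documented example (num_experts = 5) this returns sorted_token_ids = [0, 6, 9, 9, 1, 3, 9, 9, 2, 4, 9, 9, 5, 7, 9, 9, 8, 9, 9, 9], expert_ids = [0, 1, 2, 3, 4] and 20 padded tokens, matching the comment.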
|
||||
|
||||
// TODO(simon): this is temporarily adapted from
|
||||
// https://github.com/sgl-project/sglang/commit/31548116a8dc8c6df7e146e0587335a59fc5b9d7
|
||||
// we did this to unblock Deepseek V3 but there should be a better
|
||||
// implementation to manage shared memory.
|
||||
template <typename scalar_t>
|
||||
__global__ void moe_align_block_size_global_mem_kernel(
|
||||
scalar_t* __restrict__ topk_ids, int32_t* sorted_token_ids,
|
||||
int32_t* expert_ids, int32_t* total_tokens_post_pad, int32_t num_experts,
|
||||
int32_t block_size, size_t numel, int32_t* tokens_cnts, int32_t* cumsum) {
|
||||
const size_t tokens_per_thread = CEILDIV(numel, blockDim.x);
|
||||
const size_t start_idx = threadIdx.x * tokens_per_thread;
|
||||
__global__ void moe_align_block_size_kernel(
|
||||
const scalar_t* __restrict__ topk_ids,
|
||||
int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ expert_ids,
|
||||
int32_t* __restrict__ total_tokens_post_pad, int32_t num_experts,
|
||||
int32_t padded_num_experts, int32_t experts_per_warp, int32_t block_size,
|
||||
size_t numel, int32_t* __restrict__ cumsum) {
|
||||
extern __shared__ int32_t shared_counts[];
|
||||
|
||||
for (int i = 0; i < num_experts; ++i) {
|
||||
tokens_cnts[index(num_experts, threadIdx.x + 1, i)] = 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* In the first step we compute token_cnts[thread_index + 1][expert_index],
|
||||
* which counts how many tokens in the token shard of thread_index are
|
||||
* assigned to expert expert_index.
|
||||
*/
|
||||
for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) {
|
||||
++tokens_cnts[index(num_experts, threadIdx.x + 1, topk_ids[i])];
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
// For each expert we accumulate the token counts from the different threads.
|
||||
if (threadIdx.x < num_experts) {
|
||||
tokens_cnts[index(num_experts, 0, threadIdx.x)] = 0;
|
||||
for (int i = 1; i <= blockDim.x; ++i) {
|
||||
tokens_cnts[index(num_experts, i, threadIdx.x)] +=
|
||||
tokens_cnts[index(num_experts, i - 1, threadIdx.x)];
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
// We accumulate the token counts of all experts in thread 0.
|
||||
if (threadIdx.x == 0) {
|
||||
cumsum[0] = 0;
|
||||
for (int i = 1; i <= num_experts; ++i) {
|
||||
cumsum[i] = cumsum[i - 1] +
|
||||
CEILDIV(tokens_cnts[index(num_experts, blockDim.x, i - 1)],
|
||||
block_size) *
|
||||
block_size;
|
||||
}
|
||||
*total_tokens_post_pad = cumsum[num_experts];
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
/**
|
||||
* For each expert, each thread processes the tokens of the corresponding
|
||||
* blocks and stores the corresponding expert_id for each block.
|
||||
*/
|
||||
if (threadIdx.x < num_experts) {
|
||||
for (int i = cumsum[threadIdx.x]; i < cumsum[threadIdx.x + 1];
|
||||
i += block_size) {
|
||||
expert_ids[i / block_size] = threadIdx.x;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Each thread processes a token shard, calculating the index of each token
|
||||
* after sorting by expert number. Given the example topk_ids =
|
||||
* [0,1,2,1,2,3,0,3,4] and block_size = 4, then the output would be [0, 6, *,
|
||||
* *, 1, 3, *, *, 2, 4, *, *, 5, 7, *, *, 8, *, *, *], where * represents a
|
||||
* padding value(preset in python).
|
||||
*/
|
||||
for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) {
|
||||
int32_t expert_id = topk_ids[i];
|
||||
/** The cumsum[expert_id] stores the starting index of the tokens that the
|
||||
* expert with expert_id needs to process, and
|
||||
* tokens_cnts[threadIdx.x][expert_id] stores the indices of the tokens
|
||||
* processed by the expert with expert_id within the current thread's token
|
||||
* shard.
|
||||
*/
|
||||
int32_t rank_post_pad =
|
||||
tokens_cnts[index(num_experts, threadIdx.x, expert_id)] +
|
||||
cumsum[expert_id];
|
||||
sorted_token_ids[rank_post_pad] = i;
|
||||
++tokens_cnts[index(num_experts, threadIdx.x, expert_id)];
|
||||
}
|
||||
}
|
||||
|
||||
// taken from
|
||||
// https://github.com/sgl-project/sglang/commit/cdae77b03dfc6fec3863630550b45bbfc789f957
|
||||
template <typename scalar_t>
|
||||
__global__ void sgl_moe_align_block_size_kernel(
|
||||
scalar_t* __restrict__ topk_ids, int32_t* sorted_token_ids,
|
||||
int32_t* expert_ids, int32_t* total_tokens_post_pad, int32_t num_experts,
|
||||
int32_t block_size, size_t numel, int32_t* cumsum) {
|
||||
__shared__ int32_t shared_counts[32][8];
|
||||
|
||||
const int warp_id = threadIdx.x / 32;
|
||||
const int experts_per_warp = 8;
|
||||
const int warp_id = threadIdx.x / WARP_SIZE;
|
||||
const int my_expert_start = warp_id * experts_per_warp;
|
||||
|
||||
// Initialize shared_counts for this warp's experts
|
||||
for (int i = 0; i < experts_per_warp; ++i) {
|
||||
if (my_expert_start + i < num_experts) {
|
||||
shared_counts[warp_id][i] = 0;
|
||||
if (my_expert_start + i < padded_num_experts) {
|
||||
shared_counts[warp_id * experts_per_warp + i] = 0;
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
const size_t tokens_per_thread = CEILDIV(numel, blockDim.x);
|
||||
const size_t start_idx = threadIdx.x * tokens_per_thread;
|
||||
const size_t tid = threadIdx.x;
|
||||
const size_t stride = blockDim.x;
|
||||
|
||||
for (int i = start_idx; i < numel && i < start_idx + tokens_per_thread; ++i) {
|
||||
for (size_t i = tid; i < numel; i += stride) {
|
||||
int expert_id = topk_ids[i];
|
||||
int warp_idx = expert_id / experts_per_warp;
|
||||
int expert_offset = expert_id % experts_per_warp;
|
||||
atomicAdd(&shared_counts[warp_idx][expert_offset], 1);
|
||||
atomicAdd(&shared_counts[warp_idx * experts_per_warp + expert_offset], 1);
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
// Single thread computes cumulative sum and total tokens
|
||||
if (threadIdx.x == 0) {
|
||||
cumsum[0] = 0;
|
||||
for (int i = 1; i <= num_experts; ++i) {
|
||||
int expert_count = 0;
|
||||
int warp_idx = (i - 1) / experts_per_warp;
|
||||
int expert_offset = (i - 1) % experts_per_warp;
|
||||
expert_count = shared_counts[warp_idx][expert_offset];
|
||||
expert_count = shared_counts[warp_idx * experts_per_warp + expert_offset];
|
||||
|
||||
cumsum[i] =
|
||||
cumsum[i - 1] + CEILDIV(expert_count, block_size) * block_size;
|
||||
@ -248,7 +61,6 @@ __global__ void sgl_moe_align_block_size_kernel(
|
||||
|
||||
__syncthreads();
|
||||
|
||||
// Assign expert IDs to blocks
|
||||
if (threadIdx.x < num_experts) {
|
||||
for (int i = cumsum[threadIdx.x]; i < cumsum[threadIdx.x + 1];
|
||||
i += block_size) {
|
||||
@ -257,13 +69,11 @@ __global__ void sgl_moe_align_block_size_kernel(
|
||||
}
|
||||
}
|
||||
|
||||
// taken from
|
||||
// https://github.com/sgl-project/sglang/commit/cdae77b03dfc6fec3863630550b45bbfc789f957
|
||||
template <typename scalar_t>
|
||||
__global__ void sgl_moe_token_sort_kernel(scalar_t* __restrict__ topk_ids,
|
||||
int32_t* sorted_token_ids,
|
||||
int32_t* cumsum_buffer,
|
||||
size_t numel) {
|
||||
__global__ void count_and_sort_expert_tokens_kernel(
|
||||
const scalar_t* __restrict__ topk_ids,
|
||||
int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ cumsum_buffer,
|
||||
size_t numel) {
|
||||
const size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
|
||||
const size_t stride = blockDim.x * gridDim.x;
|
||||
|
||||
@ -290,132 +100,138 @@ __global__ void moe_sum_kernel(
|
||||
}
|
||||
}
|
||||
|
||||
template <typename scalar_t>
|
||||
__global__ void moe_align_block_size_small_batch_expert_kernel(
|
||||
const scalar_t* __restrict__ topk_ids,
|
||||
int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ expert_ids,
|
||||
int32_t* __restrict__ total_tokens_post_pad, int32_t num_experts,
|
||||
int32_t block_size, size_t numel) {
|
||||
const size_t tid = threadIdx.x;
|
||||
const size_t stride = blockDim.x;
|
||||
|
||||
extern __shared__ int32_t shared_mem[];
|
||||
int32_t* cumsum = shared_mem;
|
||||
int32_t* tokens_cnts = (int32_t*)(shared_mem + num_experts + 1);
|
||||
|
||||
for (int i = 0; i < num_experts; ++i) {
|
||||
tokens_cnts[(threadIdx.x + 1) * num_experts + i] = 0;
|
||||
}
|
||||
|
||||
for (size_t i = tid; i < numel; i += stride) {
|
||||
++tokens_cnts[(threadIdx.x + 1) * num_experts + topk_ids[i]];
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
if (threadIdx.x < num_experts) {
|
||||
tokens_cnts[threadIdx.x] = 0;
|
||||
for (int i = 1; i <= blockDim.x; ++i) {
|
||||
tokens_cnts[i * num_experts + threadIdx.x] +=
|
||||
tokens_cnts[(i - 1) * num_experts + threadIdx.x];
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
cumsum[0] = 0;
|
||||
for (int i = 1; i <= num_experts; ++i) {
|
||||
cumsum[i] =
|
||||
cumsum[i - 1] +
|
||||
CEILDIV(tokens_cnts[blockDim.x * num_experts + i - 1], block_size) *
|
||||
block_size;
|
||||
}
|
||||
*total_tokens_post_pad = static_cast<int32_t>(cumsum[num_experts]);
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
if (threadIdx.x < num_experts) {
|
||||
for (int i = cumsum[threadIdx.x]; i < cumsum[threadIdx.x + 1];
|
||||
i += block_size) {
|
||||
expert_ids[i / block_size] = threadIdx.x;
|
||||
}
|
||||
}
|
||||
|
||||
for (size_t i = tid; i < numel; i += stride) {
|
||||
int32_t expert_id = topk_ids[i];
|
||||
int32_t rank_post_pad =
|
||||
tokens_cnts[threadIdx.x * num_experts + expert_id] + cumsum[expert_id];
|
||||
sorted_token_ids[rank_post_pad] = i;
|
||||
++tokens_cnts[threadIdx.x * num_experts + expert_id];
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace moe
|
||||
} // namespace vllm
|
||||
|
||||
// taken from
|
||||
// https://github.com/sgl-project/sglang/blob/8b5f83ed3b7d2a49ad5c5cd5aa61c5d502f47dbc
|
||||
void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
|
||||
int64_t block_size, torch::Tensor sorted_token_ids,
|
||||
torch::Tensor experts_ids,
|
||||
torch::Tensor num_tokens_post_pad) {
|
||||
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
|
||||
|
||||
int device_max_shared_mem;
|
||||
auto dev = topk_ids.get_device();
|
||||
cudaDeviceGetAttribute(&device_max_shared_mem,
|
||||
cudaDevAttrMaxSharedMemoryPerBlockOptin, dev);
|
||||
|
||||
const int32_t num_thread = max((int32_t)num_experts, WARP_SIZE);
|
||||
const int32_t shared_mem_i32 =
|
||||
((num_thread + 1) * num_experts + (num_experts + 1)) * sizeof(int32_t);
|
||||
const int32_t shared_mem_i16 =
|
||||
((num_thread + 1) * num_experts) * sizeof(uint16_t) +
|
||||
(num_experts + 1) * sizeof(int32_t);
|
||||
|
||||
bool use_global_memory = false;
|
||||
bool use_i16 = false; // Use uint16_t for shared memory token counts
|
||||
if (shared_mem_i32 < device_max_shared_mem) {
|
||||
// Do nothing in this case. We're all set to use int32_t token counts
|
||||
} else if (shared_mem_i16 < device_max_shared_mem &&
|
||||
topk_ids.numel() <= 65535) {
|
||||
// when nelements of topk_ids is smaller than 65535 (max value of uint16),
|
||||
// element value of token_cnts would also smaller than 65535,
|
||||
// so we can use uint16 as dtype of token_cnts
|
||||
use_i16 = true;
|
||||
} else {
|
||||
use_global_memory = true;
|
||||
}
|
||||
|
||||
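For reference, the sizing above at num_experts = 256 (so num_thread = max(num_experts, WARP_SIZE) = 256) works out as below; whichever variant fits under cudaDevAttrMaxSharedMemoryPerBlockOptin selects the int32 path, the uint16 path (only valid while topk_ids.numel() <= 65535) or the global-memory fallback.

num_experts = num_thread = 256
shared_mem_i32 = ((num_thread + 1) * num_experts + (num_experts + 1)) * 4  # 264,196 bytes
shared_mem_i16 = ((num_thread + 1) * num_experts) * 2 + (num_experts + 1) * 4  # 132,612 bytes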
if (use_global_memory) {
|
||||
VLLM_DISPATCH_INTEGRAL_AND_UNSIGNED_TYPES(
|
||||
topk_ids.scalar_type(), "moe_align_block_size_global_mem_kernel", [&] {
|
||||
// calc needed amount of shared mem for `tokens_cnts` and `cumsum`
|
||||
// tensors
|
||||
const int32_t num_thread = max((int32_t)num_experts, WARP_SIZE);
|
||||
|
||||
auto options_int = torch::TensorOptions()
|
||||
.dtype(torch::kInt)
|
||||
.device(topk_ids.device());
|
||||
torch::Tensor token_cnts_buffer =
|
||||
torch::empty({(num_experts + 1) * num_experts}, options_int);
|
||||
torch::Tensor cumsum_buffer =
|
||||
torch::empty({num_experts + 1}, options_int);
|
||||
|
||||
auto kernel =
|
||||
vllm::moe::moe_align_block_size_global_mem_kernel<scalar_t>;
|
||||
kernel<<<1, num_thread, 0, stream>>>(
|
||||
topk_ids.data_ptr<scalar_t>(),
|
||||
sorted_token_ids.data_ptr<int32_t>(),
|
||||
experts_ids.data_ptr<int32_t>(),
|
||||
num_tokens_post_pad.data_ptr<int32_t>(), num_experts, block_size,
|
||||
topk_ids.numel(), token_cnts_buffer.data_ptr<int32_t>(),
|
||||
cumsum_buffer.data_ptr<int32_t>());
|
||||
});
|
||||
} else if (use_i16) {
|
||||
VLLM_DISPATCH_INTEGRAL_AND_UNSIGNED_TYPES(
|
||||
topk_ids.scalar_type(), "moe_align_block_size_kernel", [&] {
|
||||
// set dynamic shared mem
|
||||
auto kernel =
|
||||
vllm::moe::moe_align_block_size_kernel<scalar_t, uint16_t>;
|
||||
AT_CUDA_CHECK(VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(
|
||||
(void*)kernel, shared_mem_i16));
|
||||
kernel<<<1, num_thread, shared_mem_i16, stream>>>(
|
||||
topk_ids.data_ptr<scalar_t>(),
|
||||
sorted_token_ids.data_ptr<int32_t>(),
|
||||
experts_ids.data_ptr<int32_t>(),
|
||||
num_tokens_post_pad.data_ptr<int32_t>(), num_experts, block_size,
|
||||
topk_ids.numel());
|
||||
});
|
||||
} else {
|
||||
VLLM_DISPATCH_INTEGRAL_AND_UNSIGNED_TYPES(
|
||||
topk_ids.scalar_type(), "moe_align_block_size_kernel", [&] {
|
||||
auto kernel =
|
||||
vllm::moe::moe_align_block_size_kernel<scalar_t, int32_t>;
|
||||
AT_CUDA_CHECK(VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(
|
||||
(void*)kernel, shared_mem_i32));
|
||||
kernel<<<1, num_thread, shared_mem_i32, stream>>>(
|
||||
topk_ids.data_ptr<scalar_t>(),
|
||||
sorted_token_ids.data_ptr<int32_t>(),
|
||||
experts_ids.data_ptr<int32_t>(),
|
||||
num_tokens_post_pad.data_ptr<int32_t>(), num_experts, block_size,
|
||||
topk_ids.numel());
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
void sgl_moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
|
||||
int64_t block_size,
|
||||
torch::Tensor sorted_token_ids,
|
||||
torch::Tensor experts_ids,
|
                              torch::Tensor num_tokens_post_pad) {
  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  TORCH_CHECK(num_experts == 256,
              "sgl_moe_align_block_size kernel only supports deepseek v3.");
  int64_t padded_num_experts =
      ((num_experts + WARP_SIZE - 1) / WARP_SIZE) * WARP_SIZE;
  int experts_per_warp = WARP_SIZE;
  int threads = 1024;
  threads = ((threads + WARP_SIZE - 1) / WARP_SIZE) * WARP_SIZE;

  VLLM_DISPATCH_INTEGRAL_AND_UNSIGNED_TYPES(
-     topk_ids.scalar_type(), "sgl_moe_align_block_size_kernel", [&] {
+     topk_ids.scalar_type(), "moe_align_block_size_kernel", [&] {
        // calc needed amount of shared mem for `cumsum` tensors
        auto options_int =
            torch::TensorOptions().dtype(torch::kInt).device(topk_ids.device());
        torch::Tensor cumsum_buffer =
            torch::zeros({num_experts + 1}, options_int);
        bool small_batch_expert_mode =
            (topk_ids.numel() < 1024) && (num_experts <= 64);

        auto align_kernel =
            vllm::moe::sgl_moe_align_block_size_kernel<scalar_t>;
        align_kernel<<<1, 1024, 0, stream>>>(
            topk_ids.data_ptr<scalar_t>(), sorted_token_ids.data_ptr<int32_t>(),
            experts_ids.data_ptr<int32_t>(),
            num_tokens_post_pad.data_ptr<int32_t>(), num_experts, block_size,
            topk_ids.numel(), cumsum_buffer.data_ptr<int32_t>());
        if (small_batch_expert_mode) {
          const int32_t threads = max((int32_t)num_experts, WARP_SIZE);
          const int32_t shared_mem_size =
              ((threads + 1) * num_experts + (num_experts + 1)) *
              sizeof(int32_t);

          const int block_threads = 256;
          const int num_blocks =
              (topk_ids.numel() + block_threads - 1) / block_threads;
          const int max_blocks = 65535;
          const int actual_blocks = std::min(num_blocks, max_blocks);
          auto sort_kernel = vllm::moe::sgl_moe_token_sort_kernel<scalar_t>;
          sort_kernel<<<actual_blocks, block_threads, 0, stream>>>(
              topk_ids.data_ptr<scalar_t>(), sorted_token_ids.data_ptr<int32_t>(),
              cumsum_buffer.data_ptr<int32_t>(), topk_ids.numel());
          auto small_batch_expert_kernel =
              vllm::moe::moe_align_block_size_small_batch_expert_kernel<
                  scalar_t>;
          small_batch_expert_kernel<<<1, threads, shared_mem_size, stream>>>(
              topk_ids.data_ptr<scalar_t>(),
              sorted_token_ids.data_ptr<int32_t>(),
              experts_ids.data_ptr<int32_t>(),
              num_tokens_post_pad.data_ptr<int32_t>(), num_experts, block_size,
              topk_ids.numel());
        } else {
          auto align_kernel = vllm::moe::moe_align_block_size_kernel<scalar_t>;

          size_t num_warps = CEILDIV(padded_num_experts, experts_per_warp);
          size_t shared_mem_size =
              num_warps * experts_per_warp * sizeof(int32_t);

          align_kernel<<<1, threads, shared_mem_size, stream>>>(
              topk_ids.data_ptr<scalar_t>(),
              sorted_token_ids.data_ptr<int32_t>(),
              experts_ids.data_ptr<int32_t>(),
              num_tokens_post_pad.data_ptr<int32_t>(), num_experts,
              padded_num_experts, experts_per_warp, block_size,
              topk_ids.numel(), cumsum_buffer.data_ptr<int32_t>());

          const int block_threads = std::min(256, (int)threads);
          const int num_blocks =
              (topk_ids.numel() + block_threads - 1) / block_threads;
          const int max_blocks = 65535;
          const int actual_blocks = std::min(num_blocks, max_blocks);

          auto sort_kernel =
              vllm::moe::count_and_sort_expert_tokens_kernel<scalar_t>;
          sort_kernel<<<actual_blocks, block_threads, 0, stream>>>(
              topk_ids.data_ptr<scalar_t>(),
              sorted_token_ids.data_ptr<int32_t>(),
              cumsum_buffer.data_ptr<int32_t>(), topk_ids.numel());
        }
      });
}
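As background (a hedged aside, not part of the diff): both align paths above pad each expert's token segment up to a multiple of block_size and report the padded total in num_tokens_post_pad. A minimal CPU-only sketch of that padding arithmetic, with hypothetical per-expert counts:

// Standalone illustration of the block-size alignment these kernels compute.
// The per-expert counts below are hypothetical; the real op works on GPU
// tensors derived from topk_ids.
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  const int64_t block_size = 16;
  std::vector<int64_t> tokens_per_expert = {5, 0, 19, 3};

  int64_t num_tokens_post_pad = 0;
  for (int64_t cnt : tokens_per_expert) {
    // Pad each expert's segment to a multiple of block_size so every block
    // handed to the grouped GEMM belongs to exactly one expert.
    num_tokens_post_pad += (cnt + block_size - 1) / block_size * block_size;
  }
  std::cout << "padded token count: " << num_tokens_post_pad << "\n";  // 64
  return 0;
}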
@@ -12,12 +12,6 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
                          int64_t block_size, torch::Tensor sorted_token_ids,
                          torch::Tensor experts_ids,
                          torch::Tensor num_tokens_post_pad);

-void sgl_moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
-                              int64_t block_size,
-                              torch::Tensor sorted_token_ids,
-                              torch::Tensor experts_ids,
-                              torch::Tensor num_tokens_post_pad);
#ifndef USE_ROCM
torch::Tensor moe_wna16_gemm(torch::Tensor input, torch::Tensor output,
                             torch::Tensor b_qweight, torch::Tensor b_scales,
@@ -30,4 +24,8 @@ torch::Tensor moe_wna16_gemm(torch::Tensor input, torch::Tensor output,
                             int64_t BLOCK_SIZE_K, int64_t bit);
#endif

bool moe_permute_unpermute_supported();

+void shuffle_rows(const torch::Tensor& input_tensor,
+                  const torch::Tensor& dst2src_map,
+                  torch::Tensor& output_tensor);
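Declarations like these are exposed to Python as custom torch ops in the extension's binding file, which this diff does not show. A hedged sketch of that registration pattern; the library name and schema string here are illustrative, not copied from the repository:

// Illustrative binding only: the real schema strings and registrations live
// in the MoE extension's torch bindings, not in this header.
#include <torch/library.h>
#include <torch/torch.h>

void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
                          int64_t block_size, torch::Tensor sorted_token_ids,
                          torch::Tensor experts_ids,
                          torch::Tensor num_tokens_post_pad);

TORCH_LIBRARY_FRAGMENT(_moe_example, m) {  // library name is hypothetical
  m.def(
      "moe_align_block_size(Tensor topk_ids, int num_experts, int block_size,"
      " Tensor! sorted_token_ids, Tensor! experts_ids,"
      " Tensor! num_tokens_post_pad) -> ()");
  m.impl("moe_align_block_size", torch::kCUDA, &moe_align_block_size);
}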
@@ -12,7 +12,7 @@ void moe_permute(
    const torch::Tensor& input,         // [n_token, hidden]
    const torch::Tensor& topk_weights,  // [n_token, topk]
    torch::Tensor& topk_ids,            // [n_token, topk]
-   const torch::Tensor& token_expert_indicies,  // [n_token, topk]
+   const torch::Tensor& token_expert_indices,   // [n_token, topk]
    const std::optional<torch::Tensor>& expert_map,  // [n_expert]
    int64_t n_expert, int64_t n_local_expert, int64_t topk,
    const std::optional<int64_t>& align_block_size,
@@ -27,15 +27,15 @@ void moe_permute(
              "expert_first_token_offset must be int64");
  TORCH_CHECK(topk_ids.scalar_type() == at::ScalarType::Int,
              "topk_ids must be int32");
- TORCH_CHECK(token_expert_indicies.scalar_type() == at::ScalarType::Int,
-             "token_expert_indicies must be int32");
+ TORCH_CHECK(token_expert_indices.scalar_type() == at::ScalarType::Int,
+             "token_expert_indices must be int32");
  TORCH_CHECK(src_row_id2dst_row_id_map.scalar_type() == at::ScalarType::Int,
              "src_row_id2dst_row_id_map must be int32");
  TORCH_CHECK(expert_first_token_offset.size(0) == n_local_expert + 1,
              "expert_first_token_offset shape != n_local_expert+1")
  TORCH_CHECK(
-     src_row_id2dst_row_id_map.sizes() == token_expert_indicies.sizes(),
-     "token_expert_indicies shape must be same as src_row_id2dst_row_id_map");
+     src_row_id2dst_row_id_map.sizes() == token_expert_indices.sizes(),
+     "token_expert_indices shape must be same as src_row_id2dst_row_id_map");
  auto n_token = input.sizes()[0];
  auto n_hidden = input.sizes()[1];
  auto align_block_size_value =
@@ -71,7 +71,7 @@ void moe_permute(
                     expert_map_ptr, n_expert, stream);
  }
  // expert sort topk expert id and scan expert id get expert_first_token_offset
- sortAndScanExpert(get_ptr<int>(topk_ids), get_ptr<int>(token_expert_indicies),
+ sortAndScanExpert(get_ptr<int>(topk_ids), get_ptr<int>(token_expert_indices),
                    get_ptr<int>(permuted_experts_id),
                    get_ptr<int>(dst_row_id2src_row_id_map),
                    get_ptr<int64_t>(expert_first_token_offset), n_token,
@@ -130,11 +130,67 @@ void moe_unpermute(
      });
}

template <typename T>
__global__ void shuffleInputRowsKernel(const T* input,
                                       const int32_t* dst2src_map, T* output,
                                       int64_t num_src_rows,
                                       int64_t num_dst_rows, int64_t num_cols) {
  int64_t dest_row_idx = blockIdx.x;
  int64_t const source_row_idx = dst2src_map[dest_row_idx];

  if (blockIdx.x < num_dst_rows) {
    // Load 128-bits per thread
    constexpr int64_t ELEM_PER_THREAD = 128 / sizeof(T) / 8;
    using DataElem = cutlass::Array<T, ELEM_PER_THREAD>;

    // Duplicate and permute rows
    auto const* source_row_ptr =
        reinterpret_cast<DataElem const*>(input + source_row_idx * num_cols);
    auto* dest_row_ptr =
        reinterpret_cast<DataElem*>(output + dest_row_idx * num_cols);

    int64_t const start_offset = threadIdx.x;
    int64_t const stride = blockDim.x;
    int64_t const num_elems_in_col = num_cols / ELEM_PER_THREAD;

    for (int elem_index = start_offset; elem_index < num_elems_in_col;
         elem_index += stride) {
      dest_row_ptr[elem_index] = source_row_ptr[elem_index];
    }
  }
}
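For clarity (a hedged aside, not from the diff): the kernel above is a plain vectorized row gather, where destination row d receives a copy of source row dst2src_map[d]. A CPU reference with the same semantics, using illustrative std::vector buffers:

// CPU reference for shuffleInputRowsKernel: output row d is a copy of
// input row dst2src_map[d]. Types and sizes here are illustrative.
#include <cstdint>
#include <vector>

void shuffle_rows_reference(const std::vector<float>& input,
                            const std::vector<int32_t>& dst2src_map,
                            std::vector<float>& output, int64_t num_cols) {
  const int64_t num_dst_rows = static_cast<int64_t>(dst2src_map.size());
  for (int64_t d = 0; d < num_dst_rows; ++d) {
    const int64_t s = dst2src_map[d];
    for (int64_t c = 0; c < num_cols; ++c) {
      output[d * num_cols + c] = input[s * num_cols + c];
    }
  }
}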
void shuffle_rows(const torch::Tensor& input_tensor,
                  const torch::Tensor& dst2src_map,
                  torch::Tensor& output_tensor) {
  TORCH_CHECK(input_tensor.scalar_type() == output_tensor.scalar_type(),
              "Input and output tensors must have the same data type");

  auto stream = at::cuda::getCurrentCUDAStream().stream();
  int64_t const blocks = output_tensor.size(0);
  int64_t const threads = 256;
  int64_t const num_dest_rows = output_tensor.size(0);
  int64_t const num_src_rows = input_tensor.size(0);
  int64_t const num_cols = input_tensor.size(1);

  TORCH_CHECK(!(num_cols % (128 / sizeof(input_tensor.scalar_type()) / 8)),
              "num_cols must be divisible by 128 / "
              "sizeof(input_tensor.scalar_type()) / 8");

  MOE_DISPATCH(input_tensor.scalar_type(), [&] {
    shuffleInputRowsKernel<scalar_t><<<blocks, threads, 0, stream>>>(
        reinterpret_cast<scalar_t*>(input_tensor.data_ptr()),
        dst2src_map.data_ptr<int32_t>(),
        reinterpret_cast<scalar_t*>(output_tensor.data_ptr()), num_src_rows,
        num_dest_rows, num_cols);
  });
}
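A hedged usage sketch for the wrapper above; the shapes, values, and helper name are made up, and it assumes the op is linked into the extension:

// Illustrative call site for shuffle_rows: gather 4 source rows into 6
// destination rows. dst2src_map[d] names the source row copied into row d.
#include <torch/torch.h>

void shuffle_rows(const torch::Tensor& input_tensor,
                  const torch::Tensor& dst2src_map,
                  torch::Tensor& output_tensor);

void shuffle_rows_example() {
  auto opts = torch::device(torch::kCUDA);
  auto input = torch::randn({4, 128}, opts.dtype(torch::kHalf));
  auto dst2src_map =
      torch::tensor({0, 0, 1, 2, 3, 3}, opts.dtype(torch::kInt));
  auto output = torch::empty({6, 128}, input.options());
  shuffle_rows(input, dst2src_map, output);
  // Now output[d] equals input[dst2src_map[d]] for every destination row d.
}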
#else

void moe_permute(const torch::Tensor& input, const torch::Tensor& topk_weights,
                 torch::Tensor& topk_ids,
-                const torch::Tensor& token_expert_indicies,
+                const torch::Tensor& token_expert_indices,
                 const std::optional<torch::Tensor>& expert_map,
                 int64_t n_expert, int64_t n_local_expert, int64_t topk,
                 const std::optional<int64_t>& align_block_size,
@@ -147,7 +203,7 @@ void moe_permute(const torch::Tensor& input, const torch::Tensor& topk_weights,

void moe_unpermute(const torch::Tensor& input,
                   const torch::Tensor& topk_weights, torch::Tensor& topk_ids,
-                  const torch::Tensor& token_expert_indicies,
+                  const torch::Tensor& token_expert_indices,
                   const std::optional<torch::Tensor>& expert_map,
                   int64_t n_expert, int64_t n_local_expert, int64_t topk,
                   const std::optional<int64_t>& align_block_size,
@@ -14,12 +14,13 @@
    __VA_ARGS__();                                               \
    break;                                                       \
  }
-#define MOE_DISPATCH_FLOAT_CASE(...)                             \
-  MOE_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__)          \
-  MOE_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__)           \
-  MOE_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__)       \
-  MOE_DISPATCH_CASE(at::ScalarType::Float8_e5m2, __VA_ARGS__)    \
-  MOE_DISPATCH_CASE(at::ScalarType::Float8_e4m3fn, __VA_ARGS__)
+#define MOE_DISPATCH_FLOAT_CASE(...)                             \
+  MOE_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__)          \
+  MOE_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__)           \
+  MOE_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__)       \
+  MOE_DISPATCH_CASE(at::ScalarType::Float8_e5m2, __VA_ARGS__)    \
+  MOE_DISPATCH_CASE(at::ScalarType::Float8_e4m3fn, __VA_ARGS__)  \
+  MOE_DISPATCH_CASE(at::ScalarType::Byte, __VA_ARGS__)

#define MOE_DISPATCH(TYPE, ...) \
  MOE_SWITCH(TYPE, MOE_DISPATCH_FLOAT_CASE(__VA_ARGS__))
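The MOE_SWITCH / MOE_DISPATCH_CASE machinery referenced above maps a runtime scalar type to a compile-time element type and invokes the given lambda; the full macro bodies sit outside this hunk. A simplified, self-contained stand-in for the pattern (not the repository's actual macros):

// Minimal stand-in for the dispatch pattern: translate a runtime
// at::ScalarType into a compile-time element type and invoke a generic
// lambda with it. Simplified; these are not the repository's macros.
#include <ATen/ATen.h>
#include <cstdint>
#include <cstdio>

template <typename F>
void dispatch_example(at::ScalarType t, F&& f) {
  switch (t) {
    case at::ScalarType::Float: { f(float{}); break; }
    case at::ScalarType::Byte:  { f(uint8_t{}); break; }  // packed-fp4 bytes
    default: std::printf("unsupported scalar type\n");
  }
}

int main() {
  dispatch_example(at::ScalarType::Byte, [](auto tag) {
    using scalar_t = decltype(tag);
    std::printf("element size = %zu bytes\n", sizeof(scalar_t));  // prints 1
  });
  return 0;
}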
@@ -39,6 +40,11 @@ template <>
struct ScalarType2CudaType<at::ScalarType::BFloat16> {
  using type = __nv_bfloat16;
};
+// uint8 for packed fp4
+template <>
+struct ScalarType2CudaType<at::ScalarType::Byte> {
+  using type = uint8_t;
+};

// #if __CUDA_ARCH__ >= 890
// fp8
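A hedged sketch of how a ScalarType2CudaType specialization like the one added above is typically consumed; copy_kernel and launch_copy are hypothetical names, not repository code:

// Illustrative only: shows how the ScalarType2CudaType trait (including the
// Byte specialization added above) selects the device element type used to
// instantiate and launch a kernel.
#include <ATen/ATen.h>
#include <cstdint>
#include <cuda_runtime.h>

template <typename T>
__global__ void copy_kernel(const T* src, T* dst, int64_t n) {
  int64_t i = blockIdx.x * (int64_t)blockDim.x + threadIdx.x;
  if (i < n) dst[i] = src[i];
}

template <at::ScalarType kType>
void launch_copy(const void* src, void* dst, int64_t n, cudaStream_t stream) {
  // ScalarType2CudaType comes from the dispatch header shown above.
  using cuda_t = typename ScalarType2CudaType<kType>::type;
  // With kType == at::ScalarType::Byte, cuda_t is uint8_t and n counts the
  // packed fp4 bytes rather than logical fp4 values.
  copy_kernel<cuda_t><<<(n + 255) / 256, 256, 0, stream>>>(
      static_cast<const cuda_t*>(src), static_cast<cuda_t*>(dst), n);
}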