Compare commits


769 Commits

Author SHA1 Message Date
37e3806132 [Bugfix] Make Gemma3 MM V0 only for now (#14971)
Signed-off-by: Roger Wang <ywang@roblox.com>
2025-03-17 10:04:21 -07:00
c0efdd655b [Fix][Structured Output] using vocab_size to construct matcher (#14868)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
Signed-off-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
Signed-off-by: Aaron Pham <contact@aarnphm.xyz>
Co-authored-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com>
Co-authored-by: Russell Bryant <rbryant@redhat.com>
Co-authored-by: Robert Shaw <114415538+robertgshaw2-redhat@users.noreply.github.com>
2025-03-17 11:42:45 -04:00
aaaec52ad9 [Bugfix][Model] Mixtral: use unused head_dim config argument (#14961)
Signed-off-by: Quentin Torroba <quentin.torroba@mistral.ai>
2025-03-17 07:44:18 -07:00
e1eb45d397 [Bugfix] Fix precommit - line too long in pixtral.py (#14960)
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-03-17 07:18:50 -07:00
89fca671fb [V1] Default MLA to V1 (#14921)
Signed-off-by: simon-mo <simon.mo@hey.com>
2025-03-17 06:54:40 -07:00
d20b0c139c Add patch merger (#14957) 2025-03-17 06:47:50 -07:00
166a168b0f [Doc] Fix misleading log during multi-modal profiling (#14955)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-03-17 06:14:32 -07:00
2bb0e1a799 [Bugfix][ROCm] running new process using spawn method for rocm in tests. (#14810)
Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
Co-authored-by: TJian <tunjian.tan@embeddedllm.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2025-03-17 11:33:35 +00:00
6eaf1e5c52 [Misc] Add --seed option to offline multi-modal examples (#14934)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-03-17 03:00:17 -07:00
868a8c5b2c [Bugfix] Fix Ultravox on V1 (#14929)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-03-17 17:15:20 +08:00
b4ad56c1bd [V1][TPU] Apply the ragged paged attention kernel fix and remove the padding. (#14846)
Signed-off-by: Xiongfei Wei <isaacwxf23@gmail.com>
2025-03-17 01:48:28 -07:00
69698f257e fix minor miscalled method (#14327) 2025-03-17 01:47:58 -07:00
cd0cd85102 [MISC] More AMD unused var clean up (#14926)
Signed-off-by: Lu Fang <lufang@fb.com>
2025-03-17 16:40:41 +08:00
0a74bfce9c setup.py: drop assumption about local main branch (#14692)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-03-17 01:37:42 -07:00
dd3b865854 [Doc] Add vLLM Beijing meetup slide (#14938)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-03-17 16:29:36 +08:00
9b87a579aa [Misc][XPU] Use None as device capacity for XPU (#14932)
Signed-off-by: yan ma <yan.ma@intel.com>
2025-03-17 01:22:14 -07:00
b539222d4e [V1] Remove input cache client (#14864)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: Roger Wang <ywang@roblox.com>
Co-authored-by: Roger Wang <ywang@roblox.com>
2025-03-16 23:42:06 -07:00
8d6cf89526 [V1] [Spec Decode] Support random sampling for spec decode (#13933)
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-03-16 22:00:20 -07:00
583a9778e0 [Benchmark] Do not save detailed info to json by default (#14879)
Signed-off-by: simon-mo <simon.mo@hey.com>
2025-03-16 21:48:11 -07:00
a73e183e36 [Misc] Replace os environ to monkeypatch in test suite (#14516)
Signed-off-by: sibi <85477603+t-sibiraj@users.noreply.github.com>
Signed-off-by: Aaron Pham <contact@aarnphm.xyz>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: Aaron Pham <contact@aarnphm.xyz>
2025-03-16 20:35:57 -07:00
1e799b7ec1 [BugFix] Fix MLA + V1 + TP==1 causing reinitialization of cuda context (#14910) 2025-03-17 03:35:37 +00:00
7f6c5ee06c [V1][Minor] Add __repr__ to ConstantList (#14907)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-03-16 20:20:15 -07:00
faa0275730 [V1] Optimize the overhead of rewinding (#14905)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-03-16 20:19:30 -07:00
8a5a9b70d7 [CI/Build] Update defaults for test reproducibility (#14893)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-03-17 10:38:15 +08:00
bb3aeddfaf [CI] Nightly Tests (#14898)
Signed-off-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com>
Signed-off-by: rshaw@neuralmagic.com <robertgshaw2@gmail.com>
Co-authored-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com>
2025-03-17 02:06:43 +00:00
aecc780dba [V1] Enable Entrypoints Tests (#14903) 2025-03-16 17:56:16 -07:00
90df7f23aa [Doc] Add guidance for using ccache with pip install -e . in doc (#14901) 2025-03-16 23:10:04 +00:00
b9b5bdfc7d [Misc] Catching Ray Compiled Graph PP test failures for V1 (#14847) 2025-03-16 15:46:42 -07:00
31060b2757 [V1][BugFix] Detect interleaved sliding window attention (#14896)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-03-16 14:53:53 -07:00
fc1f67715d [BugFix][V1] Fix overhead related to bad_words sampling when not in use (#14894)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-03-16 14:53:34 -07:00
f6137adbcb Revert "[Bugfix] Limit profiling run sequence length by max_model_len (#14785)" (#14892)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-03-16 09:13:46 -07:00
e53b1350f2 [Bugfix] Explicitly disable Phi-4-multimodal in V1 (#14889)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-03-16 09:05:40 -07:00
d30aa7e9e6 [Bugfix] Limit profiling run sequence length by max_model_len (#14785)
Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
2025-03-16 07:44:19 -07:00
d1ad2a57af [V1] [Spec Decode] Fix ngram tests (#14878) 2025-03-16 00:29:22 -07:00
b82662d952 [BugFix] Fix torch distributed stateless PG backend init (#14870)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-03-15 20:26:19 -07:00
71c1e07107 [Kernel] Add more tuned configs (#14877)
Signed-off-by: simon-mo <simon.mo@hey.com>
2025-03-15 20:25:03 -07:00
b30c75dda4 [V1] Remove V0 fallback for mistral-tokenizer (#14873)
Signed-off-by: Roger Wang <ywang@roblox.com>
2025-03-15 20:21:11 -07:00
def232e122 [VLM] Clean up Phi-4-MM ViT implementation (#14812)
Signed-off-by: Isotr0py <2037008807@qq.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2025-03-15 18:53:52 -07:00
3453b964a3 [Misc][Doc] Minor benchmark README update (#14874)
Signed-off-by: Roger Wang <ywang@roblox.com>
2025-03-16 09:46:17 +08:00
61c6a5a796 [VLM] Merged multi-modal processor for Pixtral (#12211)
Signed-off-by: remi <remi@mistral.ai>
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-03-15 06:28:27 -07:00
74bc397b0a [Core] Expose API endpoint /is_sleeping (#14312)
Signed-off-by: Jun Duan <jun.duan.phd@outlook.com>
2025-03-15 06:28:14 -07:00
f58aea002c [CI][Intel GPU] refine intel GPU ci docker build (#14860)
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
2025-03-15 11:58:53 +00:00
3556a41434 [VLM] Limit multimodal input cache by memory (#14805)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-03-15 02:52:05 -07:00
9ed6ee92d6 [Bugfix] EAGLE output norm bug (#14464)
Signed-off-by: Bryan Lu <yuzhelu@amazon.com>
2025-03-15 06:50:33 +00:00
ee3778d5fc [Build/CI] Upgrade jinja2 to get 3 moderate CVE fixes (#14839)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-03-15 05:38:19 +00:00
aaacf17324 [Doc] V1 user guide (#13991)
Signed-off-by: Jennifer Zhao <7443418+JenZhao@users.noreply.github.com>
Signed-off-by: Roger Wang <ywang@roblox.com>
Signed-off-by: Jennifer Zhao <ai.jenniferzhao@gmail.com>
Co-authored-by: Jennifer Zhao <7443418+JenZhao@users.noreply.github.com>
Co-authored-by: Jennifer Zhao <JenZhao@users.noreply.github.com>
Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com>
Co-authored-by: Roger Wang <ywang@roblox.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2025-03-14 22:17:59 -07:00
4c7629cae9 [V1][Structured Output] calculate vocab_size eagerly (#14851)
Signed-off-by: Aaron Pham <contact@aarnphm.xyz>
2025-03-14 22:09:51 -07:00
e0fdfa1608 [CI/Build] Delete LoRA bias test (#14849)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-03-14 22:09:25 -07:00
5952d8ab61 [Attention] Get rid of mla cache alignment (#14842)
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
2025-03-15 05:08:25 +00:00
a2ae496589 [CPU] Support FP8 KV cache (#14741)
Signed-off-by: jiang1.li <jiang1.li@intel.com>
2025-03-14 22:07:36 -07:00
877e352262 [Docs] Add new East Coast vLLM Meetup slides to README and meetups.md (#14852) 2025-03-14 22:06:38 -07:00
d4d93db2c5 [V1] V1 Enablement Oracle (#13726)
Signed-off-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com>
Co-authored-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com>
Co-authored-by: Nicolò Lucchesi <nlucches@redhat.com>
Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>
Co-authored-by: Michael Goin <michael@neuralmagic.com>
2025-03-14 22:02:20 -07:00
8c0d15d5c5 [Misc][Easy] Annotate unused vars in the csrc files (#14798)
Signed-off-by: Lu Fang <lufang@fb.com>
2025-03-15 12:40:09 +08:00
97ac781c62 [Misc] Remove misleading message in gemma2 and gemma3 (#14850)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-03-14 21:35:12 -07:00
776dcec8fe Disable outlines cache by default (#14837) 2025-03-15 03:57:55 +00:00
ccf02fcbae Revert "[Model] Mamba2 Prefill Performance Tweaks: Fixing Flurry of U… (#14848) 2025-03-14 20:45:42 -07:00
acaea3bb07 [Bugfix][V1] Fix flashinfer sampling (#14815) 2025-03-14 20:42:38 -07:00
9f37422779 [Neuron][CI] update docker run command (#14829)
Signed-off-by: Liangfu Chen <liangfc@amazon.com>
2025-03-14 18:51:35 -07:00
dd344e0342 [Bugfix] Fix torch_xla in V0 which can't handle None seed introduced … (#14844)
Signed-off-by: Yarong Mu <ymu@google.com>
2025-03-15 00:41:15 +00:00
54a8804455 [Doc] More neutral K8s deployment guide (#14084)
Signed-off-by: Yuan Tang <terrytangyuan@gmail.com>
2025-03-14 16:12:36 -07:00
bbd94a19fc [Build/CI] Upgrade aiohttp to include CVE fix (#14840)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-03-14 23:11:28 +00:00
233ffce1eb [Build/CI] Move ninja to common deps (#14835)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-03-14 21:25:28 +00:00
40677783aa [CI] Add TPU v1 test (#14834)
Signed-off-by: Richard Liu <ricliu@google.com>
2025-03-14 17:13:30 -04:00
14f301b541 Update to torch==2.6.0 (#12721)
Signed-off-by: mgoin <michael@neuralmagic.com>
Signed-off-by: mgoin <mgoin64@gmail.com>
Signed-off-by: luka <luka@neuralmagic.com>
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-03-14 16:58:30 -04:00
46f98893dd [V1] Fix model parameterization for structured output tests (#14833)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-03-14 20:55:18 +00:00
fe66b34728 [Model] Mamba2 Prefill Performance Tweaks: Fixing Flurry of Unnecessary Memory Copies (#14778)
Signed-off-by: Chih-Chieh-Yang <7364402+cyang49@users.noreply.github.com>
2025-03-14 16:36:18 -04:00
270a5da495 Re-enable the AMD Entrypoints Test (#14711)
Signed-off-by: Alexei V. Ivanov <alexei.ivanov@amd.com>
2025-03-14 12:18:13 -07:00
7097b4cc1c [release] Remove log cleanup commands from TPU job (#14838) 2025-03-14 11:59:52 -07:00
977a16772c [Bugfix][Kernel]: Fix AllSpark kernel compilation errors and enable for CUDA < 12.0 (#14430)
Signed-off-by: wyj371990 <wyj371990@alibaba-inc.com>
2025-03-14 09:55:14 -07:00
73deea2fdb [Frontend] track server_load (#13950) 2025-03-14 09:53:17 -07:00
9d2b4a70f4 [V1][Metrics] Updated list of deprecated metrics in v0.8 (#14695)
Signed-off-by: Mark McLoughlin <markmc@redhat.com>
2025-03-15 00:45:25 +08:00
0b0d6421b2 [Frontend] Fix log message to use http vs https (#14774)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-03-14 09:21:09 -07:00
1140991a7b [V1] Fix vocab size calculation for structured output (#14826)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-03-14 09:18:38 -07:00
613c5bb945 [Bugfix] Fix Aria test loading (#14823)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-03-14 09:11:23 -07:00
fd8e055ffb [BugFix]: properly catch templating error when preprocess input (#13976)
Signed-off-by: Guillaume Calmettes <gcalmettes@scaleway.com>
2025-03-14 05:58:34 -07:00
ab93f1360f [VLM] Various cleanup and fixes (#14806)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-03-14 05:58:19 -07:00
40253bab44 [Bugfix][W8A8] fixed cutlass block fp8 binding (#14796) 2025-03-14 03:32:42 -07:00
c77620d22d [V1][Minor] Minor code cleanup for scheduling metrics (#14800)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-03-14 08:21:28 +00:00
989ecd2007 [Misc] Gemma3ForConditionalGeneration supports LoRA (#14797)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-03-14 01:07:30 -07:00
54cc46f3eb [Bugfix] Fix small typo in the example of Streaming delimiter (#14793) 2025-03-14 08:05:17 +00:00
601bd3268e [Misc] Clean up type annotation for SupportsMultiModal (#14794)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-03-14 00:59:56 -07:00
09269b3127 [BugFix]Fix performance serving benchmark when enable profiling (#14737)
Signed-off-by: wangli <wangli858794774@gmail.com>
2025-03-14 07:02:05 +00:00
27b50f1fe6 [Bugfix][Kernel][CPU] Fix num_tokens in CPU rotary embedding kernel (#14667)
Signed-off-by: Thien Tran <gau.nernst@yahoo.com.sg>
2025-03-13 23:47:49 -07:00
9532c49836 [Attention] MLA get rid of materialization (#14770)
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
2025-03-13 23:39:02 -07:00
0c2af17c76 [CI] Fix missing example model id in processor test (#14787)
Signed-off-by: Roger Wang <ywang@roblox.com>
2025-03-14 13:52:15 +08:00
a6e0d096dd [Feature] Add visionarena offline support for benchmark_throughput (#14654)
Signed-off-by: Jennifer Zhao <7443418+JenZhao@users.noreply.github.com>
Signed-off-by: Jennifer Zhao <ai.jenniferzhao@gmail.com>
Co-authored-by: Jennifer Zhao <7443418+JenZhao@users.noreply.github.com>
Co-authored-by: Jennifer Zhao <JenZhao@users.noreply.github.com>
Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com>
2025-03-14 04:07:54 +00:00
d3d4956261 [Neuron] flatten test parameterization for neuron attention kernels (#14712) 2025-03-13 20:46:56 -07:00
4059adc31b [Misc][Minor] Simplify SamplingParams.__post_init__() (#14772)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-03-14 11:44:20 +08:00
f1f632d9ec [ci] Reduce number of tests in fastcheck (#14782) 2025-03-13 20:43:45 -07:00
95d680b862 [Bugfix][IPEX] Add VLLM_CPU_MOE_PREPACK to allow disabling MoE prepack when CPU does not support it (#14681)
Signed-off-by: Thien Tran <gau.nernst@yahoo.com.sg>
2025-03-13 20:43:18 -07:00
fb4c7f8ef0 [Kernel] [V1] Further optimizations to ROCm (Triton) Backend to better handle GQA. (#14431)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
Co-authored-by: Jan van Lunteren <jvl@zurich.ibm.com>
Co-authored-by: Burkhard Ringlein <ngl@zurich.ibm.com>
Co-authored-by: Chih-Chieh Yang <chih.chieh.yang@ibm.com>
2025-03-13 20:42:27 -07:00
0b1cfa6180 [Kernel] LoRA - Enable CUDAGraphs for V1 (#14626)
Signed-off-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
2025-03-13 20:42:04 -07:00
32ef4983cd [V1] Temporarily disable FlashInfer Rejection Sampler (#14788)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-03-13 20:40:35 -07:00
ad19c8a003 [V1] Move OOM check into sampler run (#14728)
Signed-off-by: Roger Wang <ywang@roblox.com>
Co-authored-by: Simon Mo <simon.mo@hey.com>
2025-03-13 20:40:23 -07:00
2a602b055a forward fix PR 14245, restore build on ROCm 6.2 (#14709)
Signed-off-by: Jeff Daily <jeff.daily@amd.com>
2025-03-13 20:40:15 -07:00
7888e1d0a3 [V1] TPU - Enable prefix caching by default (#14773) 2025-03-13 20:40:05 -07:00
60c872d4b6 [Doc] Fix small typo in Transformers fallback (#14791)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-03-13 20:33:12 -07:00
3fb17d26c8 [Doc] Fix typo in documentation (#14783)
Signed-off-by: yasu52 <tsuguro4649@gmail.com>
2025-03-13 20:33:09 -07:00
d47807ba08 [Attention] Remove slow setattr in MLA (#14769)
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
2025-03-13 21:31:14 +00:00
02fcaa3d0a [V1] Detokenizer: Respect Stop Tokens + not include_stop_str_in_output (#14624)
Signed-off-by: Andrew Feldman <afeldman@neuralmagic.com>
2025-03-13 19:07:34 +00:00
8a4a2efc6f [V1][Core] using cached vocab_size for Structured Outputs (#14630)
Signed-off-by: Aaron Pham <contact@aarnphm.xyz>
2025-03-13 11:39:28 -07:00
8e9ffd37d6 [Misc] Clean up processor tests (#14771)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-03-13 18:25:37 +00:00
01b3fd0af7 [V1][Minor] Minor enhancements on scheduler (#14732)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-03-13 08:53:22 -07:00
f53a0586b9 [Bugfix] Fix prompt format of GLM4V (#14539)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-03-13 11:37:17 +00:00
b1cc4dfef5 [VLM] Support loading InternVideo2.5 models as original InternVLChatModel (#14738)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-03-13 03:10:02 -07:00
382403921f [VLM] Support pan-and-scan for Gemma3 multi-modal processor (#14672)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: Roger Wang <ywang@roblox.com>
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Co-authored-by: Roger Wang <ywang@roblox.com>
2025-03-13 02:23:12 -07:00
a73122de96 [Bugfix] fix benchmark moe (#14653)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-03-13 16:12:42 +08:00
bd44b812cb [CI/Build] Delete ultravox LoRA test (#14730)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-03-13 07:57:39 +00:00
55211b01e8 [Bugfix] Fix chunked prefill for GGUF (#14666)
Signed-off-by: SzymonOzog <szymon.ozog@aleph-alpha.com>
2025-03-13 07:19:03 +00:00
5d043c1685 [Quant] Bamba SupportsQuant (#14698)
Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
2025-03-13 04:57:05 +00:00
36d1ccb286 [Quant] BartModel SupportsQuant (#14699)
Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
2025-03-13 04:55:59 +00:00
1bc3b739c4 [V1][TPU] Add assertion on multi-step-scheduler (#14707)
Signed-off-by: Siyuan Liu <lsiyuan@google.com>
2025-03-12 21:37:58 -07:00
1bd32bc8dd [Config][Disaggregated] Add timeout configuration for the torch.store and add KVTransferConfig.kv_connector_extra_config (#14367)
Signed-off-by: Mathis Felardos <mathis@mistral.ai>
2025-03-12 20:15:20 -07:00
128bf75283 [BugFix][TritonMLA] Process weights after model loading for GGUF (#14555)
Signed-off-by: TianyuanWu <Tianyuan.Wu@amd.com>
2025-03-12 20:14:36 -07:00
a94a699c3f [ROCm][FP8] Fix for adjustments needed only for fnuz (#14689)
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
2025-03-12 20:14:04 -07:00
ab426ec9c0 Add ray[data] as tpu dependency (#14691)
Signed-off-by: <ricliu@google.com>
Signed-off-by: Richard Liu <ricliu@google.com>
2025-03-12 20:13:48 -07:00
165290d357 [bugfix] fixup warning message for plugged schedulers for v1 (#14700)
Signed-off-by: Joe Runde <Joseph.Runde@ibm.com>
2025-03-12 20:12:13 -07:00
ce20124671 [release] Add force remove for TPU logs (#14697) 2025-03-12 22:35:18 +00:00
53be4a8634 [V1] Allow sliding window + prefix caching (#13069)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-03-12 11:21:19 -07:00
f5d3acd474 [BugFix][V1] Fix parallel sampling finishing/aborts (#14512)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-03-12 10:29:48 -07:00
916836bbfb [FEAT] [ROCm] [Embedding] Add encoder-only model support into ROCm Flash Attention to enable embedding models. (#14664)
Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
2025-03-12 09:31:19 -07:00
d9f83d6206 [ROCm] Enable chunked prefill/paged attention in MLA on ROCm (#14316)
Signed-off-by: Sage Moore <sage@neuralmagic.com>
2025-03-12 15:51:20 +00:00
4a754fcf15 [Bugfix] Missing thumbnail from NVLM-D processor (#14633)
Signed-off-by: ameyanjarlekar <aanjarlekar@nvidia.com>
2025-03-12 08:50:49 -07:00
c0c25e25fa [Model] Add support for Gemma 3 (#14660)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Signed-off-by: Roger Wang <ywang@roblox.com>
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: Roger Wang <ywang@roblox.com>
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-03-12 08:36:33 -07:00
45f3f3f59e [ROCm][Bugfix] Ensure that the moe_wna16_gemm kernel is not built on ROCm platforms. (#14629)
Signed-off-by: Sage Moore <sage@neuralmagic.com>
2025-03-12 08:00:28 -04:00
ff47aab056 [CPU] Upgrade CPU backend to torch-2.6 (#13381)
Signed-off-by: jiang1.li <jiang1.li@intel.com>
Co-authored-by: Isotr0py <2037008807@qq.com>
2025-03-12 10:41:13 +00:00
debd6bbf09 [Kernel] Add ModelOpt FP4 Checkpoint Support (#12520)
Signed-off-by: Pavani Majety <pmajety@nvidia.com>
2025-03-12 05:13:11 +00:00
5c538c37b2 [V1][Bugfix][Spec Decode] Fix incorrect outputs in V1 speculative decoding due to batch indexing (#14645)
Signed-off-by: Benjamin Chislett <benjamin.chislett@centml.ai>
2025-03-11 22:12:41 -07:00
e22ee1e7a2 [Kernel] GGUF MoE kernel (#14613)
Signed-off-by: SzymonOzog <szymon.ozog@aleph-alpha.com>
2025-03-12 03:33:27 +00:00
e392d85831 [Core] Refactor QKVCrossParallelLinear implementation to support BNB 4-bit quantization (#14545)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-03-11 20:12:52 -07:00
77a318bd01 [V1][Core] Support MistralTokenizer for Structured Output (#14625)
Signed-off-by: Aaron Pham <contact@aarnphm.xyz>
2025-03-12 10:40:09 +08:00
80e78d02ac [Model] Extend Ultravox to accept audio longer than 30s (#13631)
Signed-off-by: Farzad Abdolhosseini <farzad@fixie.ai>
2025-03-12 10:27:10 +08:00
4a42b9f5d6 [Doc] Update benchmarks README (#14646)
Signed-off-by: Jennifer Zhao <7443418+JenZhao@users.noreply.github.com>
Co-authored-by: Jennifer Zhao <7443418+JenZhao@users.noreply.github.com>
Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com>
2025-03-11 19:23:04 -07:00
47532cd9f4 [core][V1] pluggable scheduler (#14466)
Signed-off-by: Joe Runde <Joseph.Runde@ibm.com>
2025-03-12 01:15:15 +00:00
36e0c8f7da [Feature] Add vllm bench CLI (#13993)
Signed-off-by: Randy Chen <acad.randyjhc@gmail.com>
Signed-off-by: Cody Yu <hao.yu.cody@gmail.com>
Co-authored-by: Cody Yu <hao.yu.cody@gmail.com>
2025-03-12 00:31:48 +00:00
9f583e360c [release] Add commands to clean up logs on TPU release node (#14642) 2025-03-12 00:14:50 +00:00
b706d898af [Bugfix][V1][PP] Only warmup sampler at last PP rank (#14643)
Signed-off-by: Cody Yu <hao.yu.cody@gmail.com>
2025-03-11 23:40:07 +00:00
863d315c86 [V1][TPU] Pad the block_table.shape[1] so the ragged paged attention can handle correctly (#14597) 2025-03-11 19:12:26 -04:00
d374f04a33 Fix run_tpu_test (#14641)
Signed-off-by: <ricliu@google.com>
Signed-off-by: Richard Liu <ricliu@google.com>
2025-03-11 21:14:33 +00:00
61a01b27a7 [V1] Delay all xgrammar usage until needed (#14616)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-03-11 20:21:33 +00:00
53056731fd fix some typos : supported_head_sizes (#14627) 2025-03-11 10:38:24 -07:00
4cbf286794 [V1] Remove cache from StructuredOutputManager (#14622)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-03-11 10:36:07 -07:00
c6e14a61ab [Hardware][Intel GPU] upgrade IPEX dependency to 2.6.10. (#14564)
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
2025-03-11 17:11:47 +00:00
07b4b7a37f [BugFix/Build] Fix sparse kernels not getting built on hopper (#14572)
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
2025-03-11 17:09:03 +00:00
07964e2f30 docs: Add documentation for s390x cpu implementation (#14198)
Signed-off-by: Dilip Gowda Bhagavan <dilip.bhagavan@ibm.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-03-11 17:02:17 +00:00
4bf82d4b90 [V1] Add regex structured output support with xgrammar (#14590)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-03-11 23:03:44 +08:00
9ab326713f Uninstall dependencies before installing requirements/tpu.txt (#14586)
Signed-off-by: <ricliu@google.com>
Signed-off-by: Richard Liu <ricliu@google.com>
2025-03-11 08:01:35 -07:00
af295e9b01 [Bugfix] Update --hf-overrides for Alibaba-NLP/gte-Qwen2 (#14609)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-03-11 07:59:43 -07:00
a1c8f3796c dynamic distpatch of fp8 kernels (#14245)
Signed-off-by: Jeff Daily <jeff.daily@amd.com>
2025-03-11 10:54:56 -04:00
08a1a1121d benchmarks: simplify test jsonschema (#14567)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-03-11 13:39:30 +00:00
1477ffc381 [VLM] Cleanup siglip legacy code and fix broken paligemma multimodal processor (#14602)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-03-11 11:27:36 +00:00
70b808fe1a [Perf]:Optimize qwen2-vl to reduce cudaMemcpyAsync (#14377)
Signed-off-by: cynthieye <987073381@qq.com>
2025-03-11 07:39:56 +00:00
63d635d179 [Misc] Correct deepseek-vl2 chat template (#14558)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-03-11 04:37:11 +00:00
1fc973c0b5 [V1][Core] Fix memory issue with logits & sampling (#14508)
Signed-off-by: Roger Wang <ywang@roblox.com>
Co-authored-by: Varun Sundar Rabindranath <3337719+varun-sundar-rabindranath@users.noreply.github.com>
2025-03-11 04:03:41 +00:00
c982ac5722 [Bugfix] Fix FP16 overflow for DeepSeek V2 (#13232)
Signed-off-by: Yida Wu <yida.wu@amd.com>
2025-03-10 20:46:59 -07:00
4290b704ff [V1][PP] Do not block engine core when no requests to schedule (#14585)
Signed-off-by: Cody Yu <hao.yu.cody@gmail.com>
2025-03-10 19:48:24 -07:00
c91b64f749 [neuron] add reshape_and_cache (#14391) 2025-03-10 18:37:29 -07:00
d6123170d5 [Neuron] Add Neuron device communicator for vLLM v1 (#14085) 2025-03-10 18:37:04 -07:00
485afdd3cb [MISC][V1] Handle exception of current_platform.get_device_name() in arg_utils (#14379)
Signed-off-by: Cody Yu <hao.yu.cody@gmail.com>
2025-03-10 20:42:11 -04:00
90e88ab756 [Kernel] moe wna16 cuda kernel (#13321)
Signed-off-by: Jinzhen Lin <linjinzhen@hotmail.com>
Co-authored-by: mgoin <mgoin64@gmail.com>
2025-03-10 20:12:40 -04:00
04421dff8a [V1] Prevent xgrammar from breaking TPU support (#14575)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-03-10 23:06:19 +00:00
432d6dad15 Fix typo in benchmark_serving_structured_output.py (#14566)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-03-10 14:58:58 -07:00
5ff0d32580 [V1] LoRA - Add triton kernels for V1 (#13096)
Signed-off-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
2025-03-10 17:27:53 -04:00
0967110e42 [Minor] Update the tqdm bar for parallel sampling (#14571)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-03-10 14:23:48 -07:00
fb0acb6c72 [Perf] Improve MLA on V1 (#14540)
Signed-off-by: simon-mo <simon.mo@hey.com>
2025-03-10 12:06:58 -07:00
92b0ce2ac7 [Bugfix][v1] fixed llava-hf/llava-1.5-7b-hf is broken on V1 (#14554)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-03-10 18:24:51 +00:00
bc2d4473bf [Docs] Make installation URLs nicer (#14556)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-03-10 10:43:08 -07:00
3b352a2f92 Correct capitalisation: VLLM -> vLLM (#14562)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-03-10 16:36:21 +00:00
dea985aef0 [V1][Bugfix] Fix handing of second_per_grid_ts for Qwen2-VL & Qwen2.5-VL (#14548)
Signed-off-by: Roger Wang <ywang@roblox.com>
2025-03-10 16:03:11 +00:00
39be30351f Correct capitalisation: Github -> GitHub (#14561)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-03-10 15:53:33 +00:00
001a9c7b0d [Doc] Update PaliGemma note to a warning (#14565)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-03-10 15:02:28 +00:00
89cdaa83e7 [Kernel] Add more dtype support for GGUF kernels (#14043)
Signed-off-by: SzymonOzog <szymon.ozog@aleph-alpha.com>
Signed-off-by: SzymonOzog <szymon.ozog@gmail.com>
2025-03-10 07:30:04 -07:00
b0746fae3d [Frontend] support image embeds (#13955)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-03-10 12:36:03 +00:00
60a98b2de5 [Docs] Mention model_impl arg when explaining Transformers fallback (#14552)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-03-10 12:13:10 +00:00
460f553a6d [Misc] Add log information for handle_process_request. (#14130)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-03-10 08:40:50 +00:00
1253b15774 [Feature] Consolidate performance benchmark datasets (#14036)
Signed-off-by: Jennifer Zhao <7443418+JenZhao@users.noreply.github.com>
Signed-off-by: Roger Wang <ywang@roblox.com>
Co-authored-by: Jennifer Zhao <7443418+JenZhao@users.noreply.github.com>
Co-authored-by: Roger Wang <ywang@roblox.com>
2025-03-10 07:23:11 +00:00
dc74613fa2 [Bugfix] Wrong requirements path - rocm (#14527)
Signed-off-by: Martin Hoyer <mhoyer@redhat.com>
2025-03-10 02:49:46 +00:00
a21076ed3a [Misc] Ensure out-of-tree quantization method recognize by cli args (#14328)
Signed-off-by: liuyanyi <wolfsonliu@163.com>
2025-03-09 12:13:31 +00:00
212007b168 [Hardware][TPU] Fix the recompiling issue in logits processor after warmup (#14510)
Signed-off-by: Chengji Yao <chengjiyao@google.com>
2025-03-09 05:44:39 -04:00
fb16eea48b [Bugfix] Revert QKVCrossParallelLinear usage in Mllama to keep BNB quantization work (#14498)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-03-09 04:47:45 +00:00
73ae0b44e9 [Bugfix] Fix tqdm progress bar when SamplingParams.n > 1 (#12428)
Signed-off-by: Yuchen Yan <740987012@qq.com>
2025-03-08 20:14:53 -08:00
6d7f037748 [Feat] Support chunked prefill for LMCache connector (#14505)
Signed-off-by: YaoJiayi <120040070@link.cuhk.edu.cn>
2025-03-08 19:30:06 -08:00
10f7552789 [V1][TPU] Remove unnecessary padding for running on TPU. (#14467) 2025-03-08 21:56:04 -05:00
b0d541947a [Attention] Default to FlashMLA backend for MLA (#14451)
Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com>
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>
2025-03-08 18:18:39 -08:00
5f0b53c6ea Revert "[V1][Core] Fix memory issue with logits & sampling" (#14504)
Signed-off-by: Roger Wang <ywang@roblox.com>
Co-authored-by: Roger Wang <ywang@roblox.com>
2025-03-08 17:43:37 -08:00
eb8b5eb183 [V1] Support bad_words in sampler (#13376)
Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>
Co-authored-by: Nick Hill <nhill@redhat.com>
2025-03-08 14:50:26 -08:00
9513290032 [Misc] Upgrade to Python 3.9 typing for additional directories (#14492)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-03-08 17:35:50 +00:00
0d5e73d30e Update CODEOWNERS for structured output (#14496)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-03-08 17:19:51 +00:00
609ef61fea [Bugfix] Fix profiling OOM and decouple encoder multimodal profiling (#14361)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-03-08 16:52:34 +00:00
db84f5eb3b [Bugfix] DeepSeek Accuracy (#14476)
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
2025-03-08 16:47:03 +00:00
206e2577fa Move requirements into their own directory (#12547)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-03-08 16:44:35 +00:00
e02883c400 [Misc] Don't run ruff at all on 3rd party libs (#14493)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-03-08 07:16:40 -08:00
9085aabd62 [benchmarks] Add option to use unique jsonschema for each request (#14457)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-03-08 06:36:39 -08:00
8d5aa466fb [V1][Core] Fix memory issue with logits & sampling (#13776)
Signed-off-by: Roger Wang <ywang@roblox.com>
2025-03-08 06:11:04 -08:00
0b7f06b447 [Misc] add use_tqdm_on_load to reduce logs (#14407)
Signed-off-by: Aaron Pham <contact@aarnphm.xyz>
2025-03-08 05:57:46 -08:00
03fe18ae0f [VLM] Add TP support for Phi-4-MM (#14453)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-03-08 05:57:14 -08:00
cb8bdfade2 [V1] TPU - Add tensor parallel support via Ray (#13618)
Signed-off-by: Alexander Matveev <amatveev@redhat.com>
2025-03-08 08:19:38 -05:00
33f227e16b [CI/Build] Use a fixed seed to avoid flaky tests (#14480)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-03-08 11:30:09 +00:00
cfd0ae8234 Add RLHF document (#14482)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-03-08 09:51:39 +00:00
7caff01a7b [Build/BugFix] Fix hopper 12.8 build (#14354)
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com>
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>
2025-03-08 08:11:56 +00:00
be0b399d74 Add training doc signposting to TRL (#14439)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-03-08 07:35:07 +00:00
b8b0ccbd2d [Bugfix] Make the deviceprofiler include LoRA memory. (#14469)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-03-08 07:12:22 +00:00
c908a07f57 [Doc] Added QwQ-32B to the supported models list in the reasoning out… (#14479)
Signed-off-by: WangErXiao <863579016@qq.com>
2025-03-08 07:07:32 +00:00
7b6fd6e486 [Doc]add doc for Qwen models tool calling (#14478)
Signed-off-by: WangErXiao <863579016@qq.com>
2025-03-08 06:58:46 +00:00
47512b3200 Default to generation_config from model (#12622)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-03-08 14:46:15 +08:00
3b9c6c6947 [CI/Build] refactor: set timezone of container to UTC (#12888)
Signed-off-by: Roger Meier <r.meier@siemens.com>
2025-03-07 22:42:01 -08:00
4aae667668 [core] add extra_args to SamplingParams (#13300)
Signed-off-by: Aviv Keshet <akeshet@scaledcognition.com>
2025-03-08 14:41:18 +08:00
9f3bc0f58c [MISC][V1] Register process killing handler only in the main thread (#14380)
Signed-off-by: Cody Yu <hao.yu.cody@gmail.com>
2025-03-07 22:40:06 -08:00
980385f8c1 [Bugfix][Disaggregated] Add a check in send_kv_caches_and_hidden_states and fix the reshape of the KVCache (#14369)
Signed-off-by: Mathis Felardos <mathis@mistral.ai>
2025-03-07 22:39:31 -08:00
ca7a2d5f28 Revert "[Perf] Reduce MLA CPU overheads in V1 (#14384)" (#14471) 2025-03-07 22:18:53 -08:00
333681408f [Bugfix][V1] Handle MLA in kv_cache_interface (#14462)
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
2025-03-07 22:18:25 -08:00
ef64044079 [V1] Prompt logprobs + APC compatibility; prompt logprobs reqs cannot fill APC (#13949) 2025-03-08 01:48:12 +00:00
66e16a038e [Bugfix] Fix torch_xla which can't handle None seed introduced in #14274 (#14459)
Signed-off-by: Yarong Mu <ymu@google.com>
2025-03-07 23:17:04 +00:00
e1f0835ae0 [V1][Metrics] Fix traceback with preemptions+LoRA (#14220)
Signed-off-by: Mark McLoughlin <markmc@redhat.com>
2025-03-07 15:36:16 -05:00
8ed5421aaa [V1] Eagerly remove finished requests from the batch (#14388)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-03-07 10:56:00 -08:00
c6359e8ca6 [v1] torch.compile integration explanation (#14437)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-03-08 01:55:50 +08:00
952a074980 [Misc] Add Phi4-MM example (#14343)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-03-07 17:28:52 +00:00
d0feea31c7 [Kernel] optimize performance of gptq marlin kernel when n is small (#14138)
Signed-off-by: Jinzhen Lin <linjinzhen@hotmail.com>
2025-03-07 11:53:38 -05:00
58abe35455 [Benchmarks] Make detokenization optional in benchmark scripts (#11697)
Signed-off-by: Jeremy Arnold <Jeremy.Arnold@amd.com>
2025-03-07 08:09:00 -08:00
f7ebad2307 [Doc] Update prefix_caching.md to match the example image (#14420) 2025-03-07 15:29:00 +00:00
80e9afb5bc [V1][Core] Support for Structured Outputs (#12388)
Signed-off-by: Aaron Pham <contact@aarnphm.xyz>
Signed-off-by: Russell Bryant <rbryant@redhat.com>
Co-authored-by: Russell Bryant <rbryant@redhat.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: Nick Hill <nhill@redhat.com>
2025-03-07 07:19:11 -08:00
1e3598edeb Use the optimized block sizes after tuning the kernel. (#14329) 2025-03-07 13:25:13 +00:00
f7a6bd0fa1 Fix missing kv_caches and attn_metadata in OpenVINOCausalLM (#14271)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-03-07 12:30:42 +00:00
0ca3b8e01c [BUGFIX] Skip tokenization support for throughput benchmark (#12712)
Signed-off-by: root <root@banff-cyxtera-s73-5.ctr.dcgpu>
Signed-off-by: Aleksandr Malyshev <maleksan@amd.com>
Co-authored-by: root <root@banff-cyxtera-s73-5.ctr.dcgpu>
Co-authored-by: Aleksandr Malyshev <maleksan@amd.com>
2025-03-07 02:51:47 -08:00
cc10281498 [Misc] Set default value of seed to None (#14274)
Signed-off-by: மனோஜ்குமார் பழனிச்சாமி <smartmanoj42857@gmail.com>
2025-03-07 10:40:01 +00:00
05fb6718f0 [Bugfix] Clean up multi-modal processors (#14417)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-03-07 10:33:38 +00:00
12c29a881f [Bugfix] Further clean up LoRA test (#14422)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-03-07 10:30:55 +00:00
70da0c0748 correct wrong markdown syntax (#14414)
Signed-off-by: vincent-pli <justdoit.pli@gmail.com>
2025-03-07 08:01:18 +00:00
c1588a2c94 [GH] Auto-apply multi-modality label to relevant PRs (#14402)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-03-07 15:26:32 +08:00
8ca7a71df7 OpenVINO: added CPU-like conditions (#14338)
Signed-off-by: Ilya Lavrenov <ilya.lavrenov@intel.com>
2025-03-06 22:24:49 -08:00
63137cd922 [Build] Add nightly wheel fallback when latest commit wheel unavailable (#14358)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-03-06 22:10:57 -08:00
ddd1ef66ec [Bugfix] Fix JambaForCausalLM LoRA (#14370)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-03-06 22:05:47 -08:00
e5e03c2c1b [BugFix] Illegal Memory Access in the blockwise cutlass fp8 GEMMs (#14396) 2025-03-06 21:56:06 -08:00
e1744502c2 [FP8] Refactor apply_fp8_linear and apply_fp8_linear_generic into an object (#14390)
Signed-off-by: luka <luka@neuralmagic.com>
2025-03-07 05:20:16 +00:00
dae6896977 [Perf] Reduce MLA CPU overheads in V1 (#14384)
Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com>
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
2025-03-06 19:59:14 -08:00
c34eeec58d [Bugfix] Correctly call cudaProfilerStop in benchmarks script (#14183)
Signed-off-by: Brayden Zhong <b8zhong@uwaterloo.ca>
2025-03-07 00:42:49 +00:00
ad60bbb2b2 [Doc] Fix a typo (#14385) 2025-03-06 16:31:52 -08:00
0578e5a462 [Hardware][TPU]Enable ragged paged attention kernel and resolve recompilation issue (#14310)
Signed-off-by: Chengji Yao <chengjiyao@google.com>
2025-03-06 23:31:05 +00:00
04222984f8 [Docs] Add nsight guide to profiling docs (#14298)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-03-06 14:19:58 -08:00
6832707e90 [V1][Bugfix] Standardize quantized kv cache rejection for attention backends (#14221)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-03-06 14:18:29 -08:00
6b2ef5cd17 [Bug] Fix Attention when ignored in by quant_method (#14313)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-03-06 14:18:06 -08:00
958adce478 [Bugfix] Fix use_direct_call condition in FusedMoE layer for (#14382)
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
2025-03-06 14:17:21 -08:00
99b0915d3b [Kernel] Add needs_fixed_stride_order tag to most GEMMs (#14306)
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
2025-03-06 14:17:09 -08:00
8ca2b21c98 [CI] Disable spawn when running V1 Test (#14345)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
2025-03-06 21:52:46 +00:00
d9292786e1 [CI/Build] Use uv python for docker rather than ppa:deadsnakes/ppa (#13569)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-03-06 16:08:36 -05:00
cc2f9b32c8 [Distributed] Add enable_expert_parallel arg (#14305)
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
2025-03-06 18:54:45 +00:00
cd579352bf [V1] Do not detokenize if sampling param detokenize is False (#14224)
Signed-off-by: Himanshu Jaju <hj@mistral.ai>
Signed-off-by: Nick Hill <nhill@redhat.com>
Co-authored-by: Nick Hill <nhill@redhat.com>
2025-03-06 10:40:24 -08:00
9f1710f1ac Fix mla prefill context performance (#13897)
Signed-off-by: ZhongYingMatrix <zhongyingmatrix@gmail.com>
2025-03-06 09:35:49 -08:00
e642ec962c Add authors to license header. (#14371)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
Co-authored-by: Burkhard Ringlein <ngl@zurich.ibm.com>
Co-authored-by: Jan van Lunteren <jvl@zurich.ibm.com>
2025-03-06 08:43:09 -08:00
ada19210a3 Adding cpu inference with VXE ISA for s390x architecture (#12613)
Signed-off-by: Dilip Gowda Bhagavan <dilip.bhagavan@ibm.com>
Signed-off-by: Rishika Kedia <rishika.kedia@in.ibm.com>
Co-authored-by: Rishika Kedia <rishika.kedia@in.ibm.com>
2025-03-06 08:40:53 -08:00
bf0560bda9 Reinstate best_of for V0 (#14356)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-03-06 08:34:22 -08:00
151b08e0fe [RLHF] use worker_extension_cls for compatibility with V0 and V1 (#14185)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-03-07 00:32:46 +08:00
81b2f4a45f [Doc] Fix date typo in README.md (#14366)
Signed-off-by: Jitse Klomp <jitse.klomp@conclusionxforce.nl>
2025-03-06 08:29:57 -08:00
82551ad616 [Core] Don't use cache during multi-modal profiling (#14336) 2025-03-06 08:03:31 -08:00
caac5c2e59 [Bugfix][Core] fix abort_seq_group and memory leak when n>1 (#14326)
Signed-off-by: courage17340 <courage17340@163.com>
2025-03-06 23:59:32 +08:00
6bd1dd9d26 [Kernel] [V1] Improved performance for V1 Triton (ROCm) backend (#14152) 2025-03-06 07:39:16 -08:00
4f27044aab [Doc] Correct beam_search using in generative_models.md (#14363) 2025-03-06 15:37:10 +00:00
0ddc991f5c [Doc] Update reasoning with stream example to use OpenAI library (#14077)
Signed-off-by: liuyanyi <wolfsonliu@163.com>
2025-03-06 13:20:37 +00:00
fa82b93853 [Frontend][Docs] Transcription API streaming (#13301)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-03-06 10:39:35 +00:00
69ff99fdcd [Core] Optimizing cross-attention QKVParallelLinear computation (#12325)
Signed-off-by: NickLucche <nlucches@redhat.com>
Signed-off-by: NickLucche <nick@nlucches-4xa100.c.openshift-330514.internal>
Co-authored-by: NickLucche <nick@nlucches-4xa100.c.openshift-330514.internal>
2025-03-06 09:37:26 +00:00
5d802522a7 [V1][VLM][Pixtral-HF] Support Pixtral-HF on V1 (#14275)
Signed-off-by: Linkun Chen <github@lkchen.net>
2025-03-06 08:58:41 +00:00
1769928079 [Model] Update Paligemma multimodal processing with PromptUpdate (#14015)
Signed-off-by: Kyle Huang <kylhuang@nvidia.com>
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2025-03-06 08:31:38 +00:00
ed6ea06577 [Hardware] Update the flash attn tag to support Blackwell (#14244) 2025-03-05 22:01:37 -08:00
5ee10e990d [Bugfix][CI] ALiBi test case in xformers multi_query_kv_attention (#11301) 2025-03-05 20:00:53 -08:00
3dbd2d813a [V1] LoRA - Enable more V1 tests (#14315)
Signed-off-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
2025-03-06 11:55:42 +08:00
f5f7f00cd9 [Bugfix][Structured Output] Support outlines engine with reasoning outputs for DeepSeek R1 (#14114) 2025-03-06 03:49:20 +00:00
abcc61e0af [misc] Mention ray list nodes command to troubleshoot ray issues (#14318)
Signed-off-by: Rui Qiao <ruisearch42@gmail.com>
2025-03-06 02:00:36 +00:00
f6bb18fd9a [BugFix] MLA + V1, illegal memory access and accuracy issues (#14253)
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
2025-03-05 17:10:13 -08:00
71eaf8969b [Build] Add UV_HTTP_TIMEOUT to avoid timeout during installation (#13850)
Signed-off-by: Yuan Tang <terrytangyuan@gmail.com>
2025-03-05 17:09:29 -08:00
ca100c90fe Add benchmark for DeepGEMM and vLLM Block FP8 Dense GEMM (#13917)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-03-05 17:08:51 -08:00
ffad94397d [CI/Build] Use spawn multiprocessing mode for V1 test pipeline (#14243)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-03-05 17:08:02 -08:00
4dacaa4a83 [BugFix] Fix prefix caching V0 MLA (#14255)
Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com>
Co-authored-by: Ying Zhong <zhongyingmatrix@gmail.com>
2025-03-05 17:07:42 -08:00
a7ea35aa67 [Bugfix] Remove num_tokens_across_dp (#14302)
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
2025-03-05 23:55:55 +00:00
1e3e76b6cc [Bugfix] Fix DeepSeek MTP crash when using TP1ModelRunner with CUDA graph due to shape mismatch (#14237)
Signed-off-by: pyc96 <pychen96@gmail.com>
2025-03-05 22:22:40 +00:00
53ea6ad830 [V1][Easy] Add empty allowed_token_ids in the v1 sampler test (#14308)
Signed-off-by: Lu Fang <lufang@fb.com>
2025-03-05 21:41:18 +00:00
1b7624bf5c [misc] Add FlashMLA as a new option of VLLM_ATTENTION_BACKEND env (#14267) 2025-03-05 21:28:50 +00:00
ac60dc7fe1 [V1][BugFix] Fix for mixed top_k batch (#14301)
Signed-off-by: Nick Hill <nhill@redhat.com>
Co-authored-by: Ye Cao <caoye.cao@alibaba-inc.com>
2025-03-05 20:43:04 +00:00
a4f1ee35d6 Deprecate best_of Sampling Parameter in anticipation for vLLM V1 (#13997)
Signed-off-by: vincent-4 <vincentzhongy+githubvincent4@gmail.com>
Signed-off-by: Brayden Zhong <b8zhong@uwaterloo.ca>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Brayden Zhong <b8zhong@uwaterloo.ca>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-03-05 20:22:43 +00:00
a32c8669ca [V1][Minor] Remove obsolete FIXME comment (#14304)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-03-05 11:59:23 -08:00
ca2ca8de57 [Docs] Add Meta Slides (#14297)
Signed-off-by: simon-mo <simon.mo@hey.com>
2025-03-05 08:30:23 -08:00
f71b00a19e [Bugfix] Fix broken vision language example (#14292)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-03-05 15:57:10 +00:00
8f808cf86e prefix_caching.md: Fixed typo (#14293)
Signed-off-by: Daivid Savernin-Frenk <daivid.frank@TurboNext.ai>
2025-03-05 15:43:13 +00:00
7bab4bb048 [Misc] Add Qwen2MoeForCausalLM moe tuning support (#14276)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-03-05 23:11:29 +08:00
e17e4488bd [LoRA] Remove linear hack outside transformers backend (#14177)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-03-05 15:06:28 +00:00
257e200a25 [V1][Frontend] Add Testing For V1 Runtime Parameters (#14159)
Signed-off-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com>
2025-03-05 14:18:55 +00:00
47d4a7e004 Small update for external_launcher backend docs (#14288) 2025-03-05 21:30:00 +08:00
7f89a594dd [Doc] [3/N] Refer code examples for common cases in dev multimodal processor (#14278)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-03-05 12:29:50 +00:00
961644e6a8 [Doc] Update nginx guide: remove privileged from vllm container run and add target GPU ID (#14217)
Signed-off-by: Iacopo Poli <iacopo@lighton.ai>
2025-03-05 11:44:10 +00:00
8d6cd32b7b [Bugfix][V1] Fix allowed_token_ids for v1 Sampler (#14169)
Signed-off-by: Lu Fang <lufang@fb.com>
2025-03-05 08:49:44 +00:00
ec79b67c77 [Misc][V1] Avoid using envs.VLLM_USE_V1 in mm processing (#14256)
Signed-off-by: Roger Wang <ywang@roblox.com>
2025-03-05 07:37:16 +00:00
32985bed7c [Frontend] Allow return_tokens_as_token_ids to be passed as a request param (#14066)
Signed-off-by: Benjamin Chislett <benjamin.chislett@centml.ai>
2025-03-05 06:30:40 +00:00
dae9ec464c Temporarily disable test_awq_gemm_opcheck (#14251)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-03-05 06:10:35 +00:00
6eaf93020d [platforms] improve rocm debugging info (#14257) 2025-03-04 21:32:18 -08:00
72c62eae5f [V1] EP/TP MoE + DP Attention (#13931) 2025-03-04 21:27:26 -08:00
0a995d5434 [Model] New model support for Phi-4-multimodal-instruct (#14119) 2025-03-04 20:57:01 -08:00
ade3f7d988 [V1][Bugfix] Do not reset prefix caching metrics (#14235) 2025-03-05 04:39:13 +00:00
0df25101d6 [Bugfix] Fix gptq_marlin for deepseek-v3 (#13750)
Signed-off-by: dangshunya <dangshunya@baichuan-inc.com>
Co-authored-by: dangshunya <dangshunya@baichuan-inc.com>
2025-03-05 12:25:53 +08:00
e123aafdf0 Disable GPTQ AllSpark kernels for CUDA Compiler < 12.0 (#14157)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-03-05 12:25:24 +08:00
5b143d33be Moved numba from common requirements to cuda/rocm specific requirements (#14199)
Signed-off-by: Nishidha Panpaliya <nishidha.panpaliya@partner.ibm.com>
2025-03-05 12:25:00 +08:00
eb59b5a6cb [misc] announce china meetup (#14248)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-03-05 10:33:50 +08:00
fbfc3ee37e [V1][TPU] TPU multimodal model support for ragged attention (#14158)
Signed-off-by: Michael Goin <mgoin64@gmail.com>
2025-03-04 19:58:48 -05:00
3e1d223626 [ROCm] Disable a few more kernel tests that are broken on ROCm (#14145)
Signed-off-by: Sage Moore <sage@neuralmagic.com>
2025-03-04 23:37:55 +00:00
4f5b059f14 Clean up unused padding_idx variables across many model definitions (#13240)
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
2025-03-04 21:27:00 +00:00
288ca110f6 [Security] Serialize using safetensors instead of pickle in Mooncake Pipe (#14228)
Signed-off-by: KuntaiDu <kuntai@uchicago.edu>
2025-03-04 21:10:32 +00:00
c2bd2196fc [v1][Metrics] Add design doc (#12745)
Signed-off-by: Mark McLoughlin <markmc@redhat.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Cody Yu <hao.yu.cody@gmail.com>
2025-03-04 20:36:55 +00:00
550c7ba3dc [Docs] Update Dockerfile dependency image (#14215)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-03-04 20:22:11 +00:00
e5b2f1601a [Frontend] Do prompt_logprobs clamping for chat as well as completions (#14225)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-03-04 20:13:06 +00:00
9badee53de Fix performance when --generation-config is not None (#14223)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-03-04 20:59:22 +01:00
beebf4742a [TPU][Profiler] Support start_profile/stop_profile in TPU worker (#13988)
Signed-off-by: Siyuan Liu <lsiyuan@google.com>
Co-authored-by: mgoin <mgoin64@gmail.com>
2025-03-04 14:40:06 -05:00
f89978ad7c add cutlass support for blackwell fp8 gemm (#13798) 2025-03-04 07:55:07 -08:00
b3cf368d79 [V1][Molmo] Fix get_multimodal_embeddings() in molmo.py (#14161) 2025-03-04 15:43:59 +00:00
c8525f06fc [V0][Metrics] Deprecate some questionable request time metrics (#14135)
Signed-off-by: Mark McLoughlin <markmc@redhat.com>
2025-03-04 15:11:33 +00:00
5db6b2c961 [V1][BugFix] Fix remaining sync engine client shutdown errors/hangs (#13869)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-03-04 15:06:47 +00:00
6247bae6c6 [Bugfix] Restrict MacOS CPU detection (#14210)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-03-04 22:25:27 +08:00
3610fb4930 [doc] add "Failed to infer device type" to faq (#14200)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-03-04 20:47:06 +08:00
71c4b40562 [sleep mode] error out with expandable_segments (#14189)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-03-04 18:54:19 +08:00
ac65bc92df [platform] add debug logging during inferring the device type (#14195)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-03-04 18:39:16 +08:00
f78c0be80a Fix benchmark_moe.py tuning for CUDA devices (#14164) 2025-03-03 21:11:03 -08:00
66233af7b6 Use math.prod instead of np.prod for trivial ops (#14142) 2025-03-03 21:09:22 -08:00
bf13d40972 [core] Pass all driver env vars to ray workers unless excluded (#14099)
Signed-off-by: Rui Qiao <ruisearch42@gmail.com>
2025-03-04 11:44:17 +08:00
989f4f430c [Misc] Remove lru_cache in NvmlCudaPlatform (#14156)
Signed-off-by: Cody Yu <hao.yu.cody@gmail.com>
2025-03-04 11:09:34 +08:00
bb5b640359 [core] moe fp8 block quant tuning support (#14068)
Signed-off-by: Divakar Verma <divakar.verma@amd.com>
2025-03-04 01:30:23 +00:00
c060b71408 [Model] Add support for GraniteMoeShared models (#13313)
Signed-off-by: Travis Johnson <tsjohnso@us.ibm.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2025-03-04 08:04:52 +08:00
79e4937c65 [v1] Add comments to the new ragged paged attention Pallas kernel (#14155)
Signed-off-by: Xiongfei Wei <isaacwxf23@gmail.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
2025-03-03 23:00:55 +00:00
cd1d3c3df8 [Docs] Add GPTQModel (#14056)
Signed-off-by: mgoin <mgoin64@gmail.com>
Co-authored-by: mgoin <mgoin64@gmail.com>
2025-03-03 21:59:09 +00:00
19d98e0c7d [Kernel] Optimize moe intermediate_cache usage (#13625)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-03-03 16:29:53 -05:00
2b04c209ee [Bugfix] Allow shared_experts skip quantization for DeepSeekV2/V3 (#14100)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-03-03 14:20:24 -07:00
ae122b1cbd [WIP][V1][Metrics] Implement max_num_generation_tokens, request_params_n, and request_params_max_tokens metrics (#14055)
Signed-off-by: Mark McLoughlin <markmc@redhat.com>
2025-03-03 19:04:45 +00:00
872db2be0e [V1] Simplify stats logging (#14082)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-03-03 10:34:14 -08:00
2dfdfed8a0 [V0][Metrics] Deprecate some KV/prefix cache metrics (#14136)
Signed-off-by: Mark McLoughlin <markmc@redhat.com>
2025-03-03 18:25:46 +00:00
c41d27156b [V0][Metrics] Remove unimplemented vllm:tokens_total (#14134)
Signed-off-by: Mark McLoughlin <markmc@redhat.com>
2025-03-03 17:50:22 +00:00
91373a0d15 Fix head_dim not existing in all model configs (Transformers backend) (#14141)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-03-03 17:48:11 +00:00
848a6438ae [ROCm] Faster Custom Paged Attention kernels (#12348) 2025-03-03 09:24:45 -08:00
98175b2816 Improve the docs for TransformersModel (#14147)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-03-03 17:03:05 +00:00
4167252eaf [V1] Refactor parallel sampling support (#13774)
Signed-off-by: Mark McLoughlin <markmc@redhat.com>
2025-03-03 08:15:27 -08:00
f35f8e2242 [Build] Make sure local main branch is synced when VLLM_USE_PRECOMPILED=1 (#13921)
Signed-off-by: Cody Yu <hao.yu.cody@gmail.com>
2025-03-03 16:43:14 +08:00
b87c21fc89 [Misc][Platform] Move use allgather to platform (#14010)
Signed-off-by: Mengqing Cao <cmq0113@163.com>
2025-03-03 15:40:04 +08:00
e584b85afd [Misc] duplicate code in deepseek_v2 (#14106) 2025-03-03 14:10:11 +08:00
09e56f9262 [Bugfix] Explicitly include "omp.h" for MacOS to avoid installation failure (#14051) 2025-03-02 17:35:01 -08:00
cf069aa8aa Update deprecated Python 3.8 typing (#13971) 2025-03-02 17:34:51 -08:00
bf33700ecd [v0][structured output] Support reasoning output (#12955)
Signed-off-by: Ce Gao <cegao@tensorchord.ai>
2025-03-02 14:49:42 -05:00
bc6ccb9878 [Doc] Source building add clone step (#14086)
Signed-off-by: qux-bbb <1147635419@qq.com>
2025-03-02 10:59:50 +00:00
82fbeae92b [Misc] Accurately capture the time of loading weights (#14063)
Signed-off-by: Jun Duan <jun.duan.phd@outlook.com>
2025-03-01 17:20:30 -08:00
cc5e8f6db8 [Model] Add LoRA support for TransformersModel (#13770)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-03-02 09:17:34 +08:00
d54990da47 [v1] Add __repr__ to KVCacheBlock to avoid recursive print (#14081) 2025-03-01 20:46:02 +00:00
b9f1d4294e [v1][Bugfix] Only cache blocks that are not in the prefix cache (#14073) 2025-03-01 08:25:54 +00:00
b28246f6ff [ROCm][V1][Bugfix] Add get_builder_cls method to the ROCmAttentionBackend class (#14065)
Signed-off-by: Sage Moore <sage@neuralmagic.com>
2025-03-01 07:18:32 +00:00
3b5567a209 [V1][Minor] Do not print attn backend twice (#13985)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-03-01 07:09:14 +00:00
fdcc405346 [Doc] Consolidate whisper and florence2 examples (#14050) 2025-02-28 22:49:15 -08:00
8994dabc22 [Documentation] Add more deployment guide for Kubernetes deployment (#13841)
Signed-off-by: KuntaiDu <kuntai@uchicago.edu>
Signed-off-by: Kuntai Du <kuntai@uchicago.edu>
2025-03-01 06:44:24 +00:00
02296f420d [Bugfix][V1][Minor] Fix shutting_down flag checking in V1 MultiprocExecutor (#14053) 2025-02-28 22:31:01 -08:00
6a92ff93e1 [Misc][Kernel]: Add GPTQAllSpark Quantization (#12931) 2025-02-28 22:30:59 -08:00
6a84164add [Bugfix] Add file lock for ModelScope download (#14060)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-03-01 06:10:28 +00:00
f64ffa8c25 [Docs] Add pipeline_parallel_size to optimization docs (#14059)
Signed-off-by: Brayden Zhong <b8zhong@uwaterloo.ca>
2025-03-01 05:43:54 +00:00
bd56c983d6 [torch.compile] Fix RMSNorm + quant fusion in the non-cutlass-fp8 case, rename RedundantReshapesPass to NoopEliminationPass (#10902)
Signed-off-by: luka <luka@neuralmagic.com>
2025-02-28 16:20:11 -07:00
084bbac8cc [core] Bump ray to 2.43 (#13994)
Signed-off-by: Rui Qiao <ruisearch42@gmail.com>
2025-02-28 21:47:44 +00:00
28943d36ce [v1] Move block pool operations to a separate class (#13973)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
Co-authored-by: Cody Yu <hao.yu.cody@gmail.com>
2025-02-28 20:53:31 +00:00
b526ca6726 Add RELEASE.md (#13926)
Signed-off-by: atalman <atalman@fb.com>
2025-02-28 12:25:50 -08:00
e7bd944e08 [v1] Cleanup the BlockTable in InputBatch (#13977)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-02-28 19:03:16 +00:00
c3b6559a10 [V1][TPU] Integrate the new ragged paged attention kernel with vLLM v1 on TPU (#13379)
Signed-off-by: Xiongfei Wei <isaacwxf23@gmail.com>
Signed-off-by: mgoin <mgoin64@gmail.com>
Co-authored-by: mgoin <mgoin64@gmail.com>
2025-02-28 11:01:36 -07:00
4be4b26cb7 Fix entrypoint tests for embedding models (#14052) 2025-02-28 08:56:44 -08:00
2aed2c9fa7 [Doc] Fix ROCm documentation (#14041)
Signed-off-by: Brayden Zhong <b8zhong@uwaterloo.ca>
2025-02-28 16:42:07 +00:00
9b61dd41e7 [Bugfix] Initialize attention bias on the same device as Query/Key/Value for QwenVL Series (#14031) 2025-02-28 07:36:08 -08:00
f7bee5c815 [VLM][Bugfix] Enable specifying prompt target via index (#14038) 2025-02-28 07:35:55 -08:00
e0734387fb [Bugfix] Fix MoeWNA16Method activation (#14024) 2025-02-28 15:22:42 +00:00
f58f8b5c96 Update AutoAWQ docs (#14042)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-02-28 15:20:29 +00:00
b3f7aaccd0 [V1][Minor] Restore V1 compatibility with LLMEngine class (#13090) 2025-02-28 00:52:25 -08:00
b91660ddb8 [Hardware][Intel-Gaudi] Regional compilation support (#13213) 2025-02-28 00:51:49 -08:00
76c89fcadd Use smaller embedding model when not testing model specifically (#13891) 2025-02-28 00:50:43 -08:00
b9e41734c5 [Bugfix][Disaggregated] patch the inflight batching on the decode node in SimpleConnector to avoid hangs in SimpleBuffer (nccl based) (#13987)
Signed-off-by: Mathis Felardos <mathis@mistral.ai>
2025-02-28 07:53:45 +00:00
1088f06242 [Doc] Move multimodal Embedding API example to Online Serving page (#14017)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-02-28 07:12:04 +00:00
73e0225ee9 [Bugfix] Check that number of images matches number of <|image|> tokens with mllama (#13911)
Signed-off-by: Travis Johnson <tsjohnso@us.ibm.com>
2025-02-28 04:00:45 +00:00
6c85da3a18 [V1]SupportsV0Only protocol for model definitions (#13959)
Signed-off-by: Roger Wang <ywang@roblox.com>
2025-02-27 20:02:15 -05:00
67fc426845 [Misc] Print FusedMoE detail info (#13974) 2025-02-27 18:53:13 -05:00
9804145cac [Model][Speculative Decoding] Expand DeepSeek MTP code to support k > n_predict (#13626)
Signed-off-by: Benjamin Chislett <benjamin.chislett@centml.ai>
2025-02-27 15:28:08 -08:00
2e94b9cfbb [Attention] Flash MLA for V1 (#13867)
Signed-off-by: Yang Chen <yangche@fb.com>
Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com>
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
Co-authored-by: Yang Chen <yangche@fb.com>
2025-02-27 23:03:41 +00:00
8294773e48 [core] Perf improvement for DSv3 on AMD GPUs (#13718)
Signed-off-by: qli88 <qiang.li2@amd.com>
2025-02-27 22:14:30 +00:00
cd813c6d4d [V1][Minor] Minor cleanup for GPU Model Runner (#13983)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-02-27 13:11:40 -08:00
38acae6e97 [ROCm] Fix the Kernels, Core, and Prefix Caching AMD CI groups (#13970)
Signed-off-by: Sage Moore <sage@neuralmagic.com>
2025-02-27 20:31:47 +00:00
a2dd48c386 [VLM] Deprecate legacy input mapper for OOT multimodal models (#13979)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-02-27 19:14:55 +00:00
126f6beeb4 Bump azure/setup-helm from 4.2.0 to 4.3.0 (#13742)
Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2025-02-27 19:04:10 +00:00
58d1b2aa77 [Attention] MLA support for V1 (#13789)
Signed-off-by: Yang Chen <yangche@fb.com>
2025-02-27 13:14:17 -05:00
f1579b229d [VLM] Generalized prompt updates for multi-modal processor (#13964)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-02-27 17:44:25 +00:00
7864875879 [Bugfix] Fix qwen2.5-vl overflow issue (#13968)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-02-27 17:30:39 +00:00
1dd422b64a Update LMFE version to v0.10.11 to support new versions of transforme… (#13930) 2025-02-27 17:16:12 +00:00
06c8f8d885 [bugfix] Fix profiling for RayDistributedExecutor (#13945)
Signed-off-by: Rui Qiao <ruisearch42@gmail.com>
2025-02-28 01:01:21 +08:00
5677c9bb3e Deduplicate .pre-commit-config.yaml's exclude (#13967)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-02-27 16:27:47 +00:00
512d77d582 Update quickstart.md (#13958) 2025-02-27 16:05:11 +00:00
7f0be2aa24 [Model] Deepseek GGUF support (#13167) 2025-02-27 02:08:35 -08:00
edf309ebbe [VLM] Support multimodal inputs for Florence-2 models (#13320) 2025-02-27 02:06:41 -08:00
788f284b53 Fix test_block_fp8.py test for MoE (#13915)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-02-27 18:00:00 +08:00
4b1d141f49 [PP] Correct cache size check (#13873)
Signed-off-by: Yang Zheng <zhengy.gator@gmail.com>
2025-02-27 17:47:29 +08:00
10c3b8c1cf [Misc] fixed 'required' is an invalid argument for positionals (#13948)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-02-27 09:06:49 +00:00
a7f37314b7 [CI/Build] Add examples/ directory to be labelled by mergify (#13944)
Signed-off-by: Brayden Zhong <b8zhong@uwaterloo.ca>
2025-02-27 08:24:11 +00:00
cd711c48b2 [V1][Metrics] Handle preemptions (#13169) 2025-02-26 20:04:59 -08:00
378b3ef6f8 [ROCm][V1] Update reshape_and_cache to properly work with CUDA graph padding (#13922) 2025-02-26 20:04:12 -08:00
c9944acbf9 [misc] Rename Ray ADAG to Compiled Graph (#13928) 2025-02-26 20:03:28 -08:00
ca377cf1b9 Use CUDA 12.4 as default for release and nightly wheels (#12098) 2025-02-26 19:06:37 -08:00
a31614e386 [ROCm][Quantization][Kernel] Use FP8 FNUZ when OCP flag is 0 or undefined (#13851)
Signed-off-by: Hollow Man <hollowman@opensuse.org>
2025-02-27 10:39:10 +08:00
f95903909f [Kernel] FlashMLA integration (#13747)
Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com>
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
2025-02-27 10:35:08 +08:00
b382a7f28f [BugFix] Make FP8 Linear compatible with torch.compile (#13918)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-02-26 13:48:55 -08:00
4cb6fa0a9c [Bugfix] Backend option to disable xgrammar any_whitespace (#12744)
Signed-off-by: Wallas Santos <wallashss@ibm.com>
Signed-off-by: Joe Runde <Joseph.Runde@ibm.com>
Co-authored-by: Joe Runde <Joseph.Runde@ibm.com>
2025-02-26 10:52:34 -08:00
d08b285adf [Misc] fixed qwen_vl_utils parameter error (#13906) 2025-02-26 08:31:53 -08:00
b27122acc2 [TPU] use torch2.6 with whl package (#13860)
Signed-off-by: Chenyaaang <llccyy1212@gmail.com>
2025-02-26 08:18:54 -05:00
934bb99c71 [Bugfix] Update expected token counts for Ultravox tests (#13895) 2025-02-26 04:56:50 -08:00
3f808cc044 [Bugfix] Do not crash V0 engine on input errors (#13101)
Signed-off-by: Joe Runde <Joseph.Runde@ibm.com>
2025-02-26 19:07:29 +08:00
ec8a5e5386 [Misc]: Add support for goodput on guided benchmarking + TPOT calculation refactor (#13736)
Signed-off-by: Brayden Zhong <b8zhong@uwaterloo.ca>
2025-02-26 19:06:47 +08:00
215bf150a6 [Bugfix] Handle None parameters in Mistral function calls. (#13786) 2025-02-26 03:06:21 -08:00
0ecdd98031 Add comments on accessing kv_cache and attn_metadata (#13887)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-02-26 18:41:02 +08:00
7b700ec8c8 [Bugfix] Add test example for Ultravox v0.5 (#13890) 2025-02-26 02:31:43 -08:00
7ca1da020f [Misc] Fix input processing for Ultravox (#13871) 2025-02-25 23:56:34 -08:00
5157338ed9 [Misc] Improve LoRA spelling (#13831) 2025-02-25 23:43:01 -08:00
e206b54331 [v0][Core] Use xgrammar shared context to avoid copy overhead for offline engine (#13837)
Signed-off-by: Seth Kimmel <seth.kimmel3@gmail.com>
2025-02-26 14:58:24 +08:00
1d35662e6d [ROCm] Disable chunked prefill/prefix caching when running MLA on non-cuda platforms (#13844)
Signed-off-by: Sage Moore <sage@neuralmagic.com>
2025-02-26 14:56:58 +08:00
e656f638de [Doc] fix the incorrect module path of tensorize_vllm_model (#13863) 2025-02-25 22:56:19 -08:00
145944cb94 Improve pipeline partitioning (#13839) 2025-02-25 18:53:56 -08:00
094b7d9496 [Kernel][Build/CI] Bump CUTLASS to 3.8 and add initializers for cutlass epilogues (#13797) 2025-02-25 18:52:03 -08:00
e1fe7591f2 [Misc]Code Cleanup (#13859)
Signed-off-by: noemotiovon <noemotiovon@gmail.com>
Co-authored-by: noemotiovon <noemotiovon@gmail.com>
2025-02-26 10:44:30 +08:00
5629f26df7 [V1][Spec Decode] Change Spec Decode Rejection Sampling API (#13729) 2025-02-25 18:14:48 -08:00
9ba28043b5 [misc] Show driver IP info when Ray fails to allocate driver worker (#13858)
Signed-off-by: Rui Qiao <ruisearch42@gmail.com>
2025-02-26 09:53:43 +08:00
24679788ed DeepSeek V2/V3/R1 only place lm_head on last pp rank (#13833)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-02-26 01:24:57 +00:00
07c4353057 [Model] Support Grok1 (#13795)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-02-26 01:07:12 +00:00
34e3494e70 Fix failing MyGemma2Embedding test (#13820)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-02-25 12:33:03 -08:00
f75aa72732 [Neuron] Add custom_ops for neuron backend (#13246)
Signed-off-by: Liangfu Chen <liangfc@amazon.com>
Co-authored-by: George Novack <gnovack@amazon.com>
Co-authored-by: Aoyu Zhang <aoyuzhan@amazon.com>
2025-02-25 11:47:49 -08:00
340e39e387 Fix string parsing error (#13825) 2025-02-25 08:20:29 -08:00
f4133ce4e5 [Bugfix] Revert inspection code in #13743 (#13832)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-02-26 00:18:50 +08:00
6522d55b6f Fix /v1/audio/transcriptions Bad Request Error (#13811) 2025-02-25 06:03:33 -08:00
6ff518626c [Bugfix] Fix deepseek-vl2 inference with more than 2 images (#13818) 2025-02-25 06:03:02 -08:00
fa82074167 [Bugfix] Flush TunableOp results before worker processes are destroyed. (#13623)
Signed-off-by: Nichols A. Romero <nick.romero@amd.com>
2025-02-25 11:08:20 +00:00
75e9d49796 [Bugfix] Initialize attention bias on the same device as Query/Key/Value (#13468) 2025-02-25 02:13:09 -08:00
32c3b6bfd1 [Misc]Clarify Error Handling for Non-existent Model Paths and HF Repo IDs (#13724)
Signed-off-by: Chen-0210 <chenjincong11@gmail.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
2025-02-25 10:12:19 +00:00
37b6cb4985 [CI/Build] Fix V1 LoRA failure (#13767) 2025-02-25 02:01:15 -08:00
aabeb2688f [ROCm][Quantization][Kernel] Using HIP FP8 header (#12593) 2025-02-25 00:39:59 -08:00
2f42a4888c [Feature] Support KV cache offloading and disagg prefill with LMCache connector. (#12953) 2025-02-25 00:38:42 -08:00
3173c3b34e [misc] Clean up ray compiled graph type hints (#13731) 2025-02-25 00:37:08 -08:00
2d87d7d1ac [Bugfix] Modify modelscope api usage in transformer_utils (#13807) 2025-02-25 00:36:07 -08:00
aab392774b [Core] xgrammar: Expand list of unsupported jsonschema keywords (#13783)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-02-25 08:21:25 +00:00
6724e79164 [Misc] Check that the model can be inspected upon registration (#13743) 2025-02-25 00:18:19 -08:00
03f48b3db6 [Core] LoRA V1 - Add add/pin/list/remove_lora functions (#13705) 2025-02-25 00:18:02 -08:00
4d251ad00e Fix CompressedTensorsWNA16MoE with grouped scales (#13769) 2025-02-25 00:17:14 -08:00
18e505930d [Bugfix] Support MLA for CompressedTensorsWNA16 (#13725)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-02-25 06:10:31 +00:00
4a8cfc7551 [Bugfix] Fix deepseek-v2 error: "missing 1 required positional argument: 'residual'" (#13802) 2025-02-24 20:33:59 -08:00
bc32bc73aa [V1][Metrics] Implement vllm:lora_requests_info metric (#13504) 2025-02-24 20:01:33 -08:00
ab1091d5f2 [Misc][Attention][Quantization] init property earlier (#13733)
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-02-25 03:19:30 +00:00
1e15aaef56 [Bugfix][Quantization] Fix FP8 + EP (#13784)
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
2025-02-25 10:54:17 +08:00
51010a1807 [Misc] set single whitespace between log sentences (#13771)
Signed-off-by: cjackal <44624812+cjackal@users.noreply.github.com>
2025-02-25 10:26:12 +08:00
7196a3b1db [Doc] arg_utils.py: fixed a typo (#13785) 2025-02-24 18:23:04 -08:00
cdc1fa12eb Remove unused kwargs from model definitions (#13555) 2025-02-24 17:13:52 -08:00
f61528d46d [Misc][Chore] Clean Up AsyncOutputProcessing Logs (#13780) 2025-02-24 16:39:07 -08:00
1f0ae3ed0a [Misc] Clean Up EngineArgs.create_engine_config (#13734)
Signed-off-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com>
2025-02-24 13:52:21 -05:00
db986c19ea Fix precommit fail in fused_moe intermediate_cache2 chunking (#13772)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-02-24 09:25:47 -08:00
227578480d Revert "[V1][Core] Fix memory issue with logits & sampling" (#13775) 2025-02-24 09:16:05 -08:00
befc402d34 [V1] V1 engine implements parallel sampling (AsyncLLM and LLMEngine) (#10980)
Signed-off-by: Andrew Feldman <afeldman@neuralmagic.com>
Co-authored-by: Nick Hill <nhill@redhat.com>
2025-02-24 08:29:41 -08:00
444b0f0f62 [Misc][Docs] Raise error when flashinfer is not installed and VLLM_ATTENTION_BACKEND is set (#12513)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-02-24 10:43:21 -05:00
ccc00515fd [BugFix] Illegal memory access for MoE On H20 (#13693) 2025-02-24 07:37:32 -08:00
781096e385 Expert Parallelism (EP) Support for DeepSeek V2 (#12583) 2025-02-24 07:33:20 -08:00
7940d8a6a7 [CI/Build] add python-json-logger to requirements-common (#12842) 2025-02-24 06:10:33 -08:00
c0e3ecd6d2 [Bugfix] fix(logging): add missing opening square bracket (#13011) 2025-02-24 06:10:25 -08:00
23eca9cf68 [model][refactor] remove cuda hard code in models and layers (#13658) 2025-02-24 06:10:14 -08:00
437b76ff59 [V1][Core] Fix memory issue with logits & sampling (#13721) 2025-02-24 06:10:06 -08:00
f90a375593 [ci] Add logic to change model to S3 path only when S3 CI env var is on (#13727)
Signed-off-by: <>
Co-authored-by: EC2 Default User <ec2-user@ip-172-31-63-253.us-west-2.compute.internal>
2025-02-24 06:32:11 +00:00
e7ef74e26e Fix some issues with benchmark data output (#13641)
Signed-off-by: Huy Do <huydhn@gmail.com>
2025-02-24 10:23:18 +08:00
cbae7af552 [V1][BugFix] Fix engine core client shutdown hangs (#13298)
Even though ZMQ context.destroy() is meant to close open sockets before terminating the context, it appears to be necessary to do this explicitly or else it can hang in the context.term() method.

Close zmq sockets explicitly before terminating context, make shutdown of client resource more robust, shut down engine core process prior to terminating zmq context.

Signed-off-by: Nick Hill <nhill@redhat.com>
2025-02-23 13:07:43 -08:00
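
The shutdown ordering described in the commit message above (close ZMQ sockets explicitly, then terminate the context) can be illustrated with a minimal sketch; the helper below is hypothetical and not vLLM's actual client code:

```python
import zmq


def shutdown(ctx: zmq.Context, sockets: list[zmq.Socket]) -> None:
    """Close sockets explicitly before terminating the context.

    Relying on ctx.destroy()/ctx.term() alone can hang if a socket still
    holds undelivered messages, so each socket is closed with linger=0 first.
    """
    for sock in sockets:
        sock.close(linger=0)  # drop pending messages instead of blocking
    ctx.term()                # safe now: no open sockets remain
```
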
eb24dc4a45 [v1] torchrun compatibility (#13642)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-02-23 22:47:24 +08:00
9bebc9512f [Misc] Deprecate --dataset from benchmark_serving.py (#13708)
Signed-off-by: Roger Wang <ywang@roblox.com>
2025-02-23 13:32:20 +00:00
5a2ba16f5c [Core][Distributed] Use IPC (domain socket) ZMQ socket for local comms (#13688) 2025-02-23 02:54:29 -08:00
ba5106e519 [LMM] Implement merged multimodal processor for whisper (#13278) 2025-02-23 01:46:03 -08:00
d5ca2110f1 [Quant] BaiChuan SupportsQuant (#13710) 2025-02-22 19:21:15 -08:00
2c5e637b57 [ci] Use env var to control whether to use S3 bucket in CI (#13634) 2025-02-22 19:19:45 -08:00
322d2a27d6 [BugFix] Minor: logger import in attention backend (#13706)
Signed-off-by: Andy Lo <andy@mistral.ai>
2025-02-22 16:51:13 -08:00
82e0d601fc [CI/Build] Fix pre-commit errors from #13571 (#13709)
Signed-off-by: Roger Wang <ywang@roblox.com>
2025-02-22 16:50:38 -08:00
78ac0f591d [CI/Build] fix uv caching in Dockerfile (#13611) 2025-02-22 08:25:20 -08:00
b56155e7f3 [XPU]fix setuptools version for xpu (#13548) 2025-02-22 08:05:35 -08:00
382f66fb08 [Bugfix] Fix boolean conversion for OpenVINO env variable (#13615) 2025-02-22 08:04:12 -08:00
8354f6640c [Doc] Dockerfile instructions for optional dependencies and dev transformers (#13699) 2025-02-22 06:04:31 -08:00
c904fdddf6 [ROCm] Apply FP8 weights padding to values not divisible by 512 bytes on ROCm (#13231) 2025-02-22 05:54:38 -08:00
558db8083c [V1][Kernel] Refactor the prefix_prefill kernel so that the caller no longer has to pass in the context lengths (#13095) 2025-02-22 05:25:41 -08:00
e109e598c7 [NVIDIA] Support nvfp4 cutlass gemm (#13571) 2025-02-22 05:24:05 -08:00
8db1b9d0a1 Support SSL Key Rotation in HTTP Server (#13495) 2025-02-22 05:17:44 -08:00
2382ad29d1 [ci] fix linter (#13701)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-02-22 20:28:59 +08:00
3e472d882a [core] set up data parallel communication (#13591)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-02-22 19:28:59 +08:00
7f6bae561c [CI/Build] Fix pre-commit errors (#13696) 2025-02-22 00:31:26 -08:00
105b8ce4c0 [Misc] Reduce LoRA-related static variable (#13166) 2025-02-22 00:21:30 -08:00
2cb8c1540e [Metrics] Add --show-hidden-metrics-for-version CLI arg (#13295) 2025-02-22 00:20:45 -08:00
1cd981da4f [V1][Metrics] Support vllm:cache_config_info (#13299) 2025-02-22 00:20:00 -08:00
fca20841c2 Correction to TP logic for Mamba Mixer 2 when Num Groups not divisible by TP Size (#13660) 2025-02-22 00:19:10 -08:00
da31b5333e [Bugfix] V1 Memory Profiling: V0 Sampler Integration without Rejection Sampler (#13594)
Signed-off-by: Jennifer Zhao <7443418+JenZhao@users.noreply.github.com>
Co-authored-by: Roger Wang <ywang@roblox.com>
2025-02-22 00:08:29 -08:00
bb78fb318e [v1] Support allowed_token_ids in v1 Sampler (#13210)
Signed-off-by: Lu Fang <lufang@fb.com>
2025-02-22 14:13:05 +08:00
8aca27fa11 [Bugfix] Fix benchmark script bug: inaccurate stats for vllm backend when max_model_len < input_len + output_len (#13691)
Signed-off-by: WangErXiao <863579016@qq.com>
2025-02-22 14:10:38 +08:00
95c617e04b [Misc] Bump compressed-tensors (#13619) 2025-02-21 22:09:04 -08:00
9a1f1da5d1 [Bugfix][Model] OLMo 2: split qkv correctly for GQA and MQA (#13687) 2025-02-21 22:07:45 -08:00
68d630a0c7 [ROCM] fix native attention function call (#13650) 2025-02-21 22:07:04 -08:00
68d535ef44 [Misc] Capture and log the time of loading weights (#13666) 2025-02-21 22:06:34 -08:00
c6ed93860f [Bugfix][API Server] Fix invalid usage of 'ge' and 'le' in port valid… (#13672) 2025-02-21 22:05:28 -08:00
0ffdf8ce0c [HTTP Server] Make model param optional in request (#13568) 2025-02-21 21:55:50 -08:00
8c0dd3d4df docs: Add a note on full CI run in contributing guide (#13646) 2025-02-21 21:53:59 -08:00
ada7c780d5 [Misc] Fix yapf linting tools etc not running on pre-commit (#13695)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-02-22 13:10:43 +08:00
288cc6c234 [Attention] MLA with chunked prefill (#12639)
Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com>
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
Co-authored-by: Patrick Horn <patrick.horn@gmail.com>
Co-authored-by: simon-mo <xmo@berkeley.edu>
Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>
2025-02-21 15:30:12 -08:00
900edbfa48 fix typo of grafana dashboard, with correct datasource (#13668)
Signed-off-by: John Zheng <john.zheng@hp.com>
2025-02-21 18:21:05 +00:00
b2c3fc5d65 [Bugfix][CPU] Fix cpu all-reduce using native pytorch implementation (#13586) 2025-02-20 22:24:17 -08:00
839b27c6cc [Kernel]Add streamK for block-quantized CUTLASS kernels (#12978) 2025-02-20 22:14:24 -08:00
34ad27fe83 [ci] Fix metrics test model path (#13635) 2025-02-20 22:12:10 -08:00
1c3c975766 [FEATURE] Enables /score endpoint for embedding models (#12846) 2025-02-20 22:09:47 -08:00
1cdc88614a Missing comment explaining VDR variable in GGUF kernels (#13290) 2025-02-20 22:06:54 -08:00
31aa045c11 [V1][Sampler] Avoid an operation during temperature application (#13587) 2025-02-20 22:05:56 -08:00
a30c093502 [Bugfix] Add mm_processor_kwargs to chat-related protocols (#13644) 2025-02-20 22:04:33 -08:00
c7b07a95a6 Use pre-commit to update requirements-test.txt (#13617) 2025-02-20 22:03:27 -08:00
27a09dc52c [NVIDIA] Fix an issue to use current stream for the nvfp4 quant (#13632) 2025-02-20 22:01:48 -08:00
981f3c831e [Misc] Adding script to setup ray for multi-node vllm deployments (#12913) 2025-02-20 21:16:40 -08:00
44c33f01f3 Add llmaz as another integration (#13643)
Signed-off-by: kerthcet <kerthcet@gmail.com>
2025-02-21 03:52:40 +00:00
33170081f1 [Neuron][Kernel] Vectorize KV cache load in FlashPagedAttention to maximize DMA bandwidth (#13245)
Signed-off-by: Lingfan Yu <lingfany@amazon.com>
2025-02-20 17:45:45 -08:00
71face8540 [Bugfix] Fix max_num_batched_tokens for MLA (#13620)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-02-20 17:45:20 -08:00
bfbc0b32c6 [Frontend] Add backend-specific options for guided decoding (#13505)
Signed-off-by: Joe Runde <Joseph.Runde@ibm.com>
2025-02-20 15:07:58 -05:00
6a417b8600 fix neuron performance issue (#13589) 2025-02-20 10:59:36 -08:00
d3ea50113c [V1][Minor] Print KV cache size in token counts (#13596)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-02-20 09:24:31 -08:00
34aad515c8 Update pre-commit's isort version to remove warnings (#13614) 2025-02-20 08:00:14 -08:00
ed6e9075d3 [Bugfix] Fix deepseekv3 grouped topk error (#13474)
Signed-off-by: Chen-XiaoBing <chenxb002@whu.edu.cn>
2025-02-20 06:47:01 -08:00
992e5c3d34 Merge similar examples in offline_inference into single basic example (#12737) 2025-02-20 04:53:51 -08:00
b69692a2d8 [Kernel] LoRA - Refactor sgmv kernels (#13110) 2025-02-20 07:28:06 -05:00
a64a84433d [2/n][ci] S3: Use full model path (#13564)
Signed-off-by: <>
2025-02-20 01:20:15 -08:00
aa1e62d0db [ci] Fix spec decode test (#13600) 2025-02-20 16:56:00 +08:00
497bc83124 [CI/Build] Use uv in the Dockerfile (#13566) 2025-02-19 23:05:44 -08:00
3738e6fa80 [API Server] Add port number range validation (#13506)
Signed-off-by: Yuan Tang <terrytangyuan@gmail.com>
2025-02-20 15:05:13 +08:00
0023cd2b9d [ROCm] MI300A compile targets deprecation (#13560) 2025-02-19 23:05:00 -08:00
041e294716 [Misc] add mm_processor_kwargs to extra_body for Qwen2.5-VL (#13533) 2025-02-19 23:04:30 -08:00
9621667874 [Misc] Warn if the vLLM version can't be retrieved (#13501)
Signed-off-by: Alex-Brooks <Alex.brooks@ibm.com>
2025-02-20 06:24:48 +00:00
8c755c3b6d [bugfix] spec decode worker get tp group only when initialized (#13578) 2025-02-20 04:46:28 +00:00
ba81163997 [core] add sleep and wake up endpoint and v1 support (#12987)
Signed-off-by: youkaichao <youkaichao@gmail.com>
Signed-off-by: cennn <2523403608@qq.com>
Co-authored-by: cennn <2523403608@qq.com>
2025-02-20 12:41:17 +08:00
0d243f2a54 [ROCm][MoE] mi300 mixtral8x7B perf for specific BS (#13577)
Signed-off-by: Divakar Verma <divakar.verma@amd.com>
2025-02-20 04:01:02 +00:00
88f6ba3281 [ci] Add AWS creds for AMD (#13572) 2025-02-20 03:56:06 +00:00
512368e34a [Misc] Qwen2.5 VL support LoRA (#13261) 2025-02-19 18:37:55 -08:00
473f51cfd9 [3/n][CI] Load Quantization test models with S3 (#13570)
Signed-off-by: <>
Co-authored-by: EC2 Default User <ec2-user@ip-172-31-20-117.us-west-2.compute.internal>
2025-02-20 10:12:30 +08:00
a4c402a756 [BugFix] Avoid error traceback in logs when V1 LLM terminates (#13565)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-02-20 00:49:01 +00:00
550d97eb58 [Misc] Avoid calling unnecessary hf_list_repo_files for local model path (#13348)
Signed-off-by: isotr0py <2037008807@qq.com>
2025-02-19 18:57:48 +00:00
fbbe1fbac6 [MISC] Logging the message about Ray teardown (#13502)
Signed-off-by: Cody Yu <hao.yu.cody@gmail.com>
Co-authored-by: Rui Qiao <161574667+ruisearch42@users.noreply.github.com>
2025-02-19 09:40:50 -08:00
01c184b8f3 Fix copyright year to auto get current year (#13561) 2025-02-19 16:55:34 +00:00
ad5a35c21b [doc] clarify multi-node serving doc (#13558)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-02-19 22:32:17 +08:00
5ae9f26a5a [Bugfix] Fix device ordinal for multi-node spec decode (#13269)
Signed-off-by: Shangming Cai <caishangming@linux.alibaba.com>
2025-02-19 22:13:15 +08:00
377d10bd14 [VLM][Bugfix] Pass processor kwargs properly on init (#13516)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-02-19 13:13:50 +00:00
52ce14d31f [doc] clarify profiling is only for developers (#13554)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-02-19 20:55:58 +08:00
81dabf24a8 [CI/Build] force writing version file (#13544)
Signed-off-by: Daniele Trifirò <dtrifiro@redhat.com>
2025-02-19 18:48:03 +08:00
423330263b [Feature] Pluggable platform-specific scheduler (#13161)
Signed-off-by: Yannick Schnider <yannick.schnider1@ibm.com>
Signed-off-by: Yannick Schnider <Yannick.Schnider1@ibm.com>
2025-02-19 17:16:38 +08:00
caf7ff4456 [V1][Core] Generic mechanism for handling engine utility (#13060)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-02-19 17:09:22 +08:00
f525c0be8b [Model][Speculative Decoding] DeepSeek MTP spec decode (#12755)
Signed-off-by: Lu Fang <fanglu@fb.com>
Co-authored-by: LiuXiaoxuanPKU <lilyliupku@gmail.com>
2025-02-19 17:06:23 +08:00
983a40a8bb [Bugfix] Fix Positive Feature Layers in Llava Models (#13514)
Signed-off-by: Alex-Brooks <Alex.brooks@ibm.com>
2025-02-19 08:50:07 +00:00
fdc5df6f54 use device param in load_model method (#13037) 2025-02-19 16:05:02 +08:00
3b05cd4555 [perf-benchmark] Fix ECR path for premerge benchmark (#13512)
Signed-off-by: <>
Co-authored-by: EC2 Default User <ec2-user@ip-172-31-20-117.us-west-2.compute.internal>
2025-02-19 07:56:11 +00:00
d5d214ac7f [1/n][CI] Load models in CI from S3 instead of HF (#13205)
Signed-off-by: <>
Co-authored-by: EC2 Default User <ec2-user@ip-172-31-20-117.us-west-2.compute.internal>
2025-02-19 07:34:59 +00:00
fd84857f64 [Doc] Add clarification note regarding paligemma (#13511) 2025-02-18 22:24:03 -08:00
8aada19dfc [ROCm][MoE configs] mi325 mixtral & mi300 qwen_moe (#13503) 2025-02-18 22:23:24 -08:00
9aa95b0e6a [perf-benchmark] Allow premerge ECR (#13509)
Signed-off-by: <>
Co-authored-by: EC2 Default User <ec2-user@ip-172-31-20-117.us-west-2.compute.internal>
2025-02-19 05:13:41 +00:00
d0a7a2769d [Hardware][Gaudi][Feature] Support Contiguous Cache Fetch (#12139)
Signed-off-by: yuzhou <yuzhou@habana.ai>
Signed-off-by: zhouyu5 <yu.zhou@intel.com>
Co-authored-by: Cody Yu <hao.yu.cody@gmail.com>
2025-02-18 19:40:19 -08:00
00b69c2d27 [Misc] Remove dangling references to --use-v2-block-manager (#13492)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-02-19 03:37:26 +00:00
4c82229898 [V1][Spec Decode] Optimize N-gram matching with Numba (#13365)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-02-18 13:19:58 -08:00
c8d70e2437 Pin Ray version to 2.40.0 (#13490)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-02-18 12:50:31 -08:00
30172b4947 [V1] Optimize handling of sampling metadata and req_ids list (#13244)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-02-18 12:15:33 -08:00
a4d577b379 [V1][Tests] Adding additional testing for multimodal models to V1 (#13308)
Signed-off-by: andoorve <37849411+andoorve@users.noreply.github.com>
2025-02-18 09:53:14 -08:00
7b203b7694 [misc] fix debugging code (#13487)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-02-18 09:37:11 -08:00
4fb8142a0e [V1][PP] Enable true PP with Ray executor (#13472)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-02-18 09:15:32 -08:00
a02c86b4dd [CI/Build] migrate static project metadata from setup.py to pyproject.toml (#8772) 2025-02-18 08:02:49 -08:00
3809458456 [Bugfix] Fix invalid rotary embedding unit test (#13431)
Signed-off-by: Liangfu Chen <liangfc@amazon.com>
2025-02-18 11:52:03 +00:00
d3231cb436 [Bugfix] Handle content type with optional parameters (#13383)
Signed-off-by: Zifei Tong <zifeitong@gmail.com>
2025-02-18 11:29:13 +00:00
435b502a6e [ROCm] Make amdsmi import optional for other platforms (#13460) 2025-02-18 03:15:56 -08:00
29fc5772c4 [Bugfix] Remove noisy error logging during local model loading (#13458) 2025-02-18 03:15:48 -08:00
2358ca527b [Doc]: Improve feature tables (#13224)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-02-18 18:52:39 +08:00
8cf97f8661 [Bugfix] Fix failing transformers dynamic module resolving with spawn multiproc method (#13403)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-02-18 10:25:53 +00:00
e2603fefb8 [Bugfix] Ensure LoRA path from the request can be included in err msg (#13450)
Signed-off-by: Yuan Tang <terrytangyuan@gmail.com>
2025-02-18 16:19:15 +08:00
b53d79983c Add outlines fallback when JSON schema has enum (#13449)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-02-18 06:49:41 +00:00
9915912f7f [V1][PP] Fix & Pin Ray version in requirements-cuda.txt (#13436)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-02-17 21:58:06 -08:00
d1b649f1ef [Quant] Aria SupportsQuant (#13416) 2025-02-17 21:51:09 -08:00
ac19b519ed [core] fix sleep mode in pytorch 2.6 (#13456)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-02-18 13:48:10 +08:00
a1074b3efe [Bugfix] Only print out chat template when supplied (#13444) 2025-02-17 21:43:31 -08:00
00294e1bc6 [Quant] Arctic SupportsQuant (#13366) 2025-02-17 21:35:09 -08:00
88787bce1d [Quant] Molmo SupportsQuant (#13336) 2025-02-17 21:34:47 -08:00
932b51cedd [v1] fix parallel config rank (#13445)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-02-18 12:33:45 +08:00
7c7adf81fc [ROCm] fix get_device_name for rocm (#13438)
Signed-off-by: Divakar Verma <divakar.verma@amd.com>
2025-02-18 04:07:12 +00:00
67ef8f666a [Model] Enable quantization support for transformers backend (#12960) 2025-02-17 19:52:47 -08:00
efbe854448 [Misc] Remove dangling references to SamplingType.BEAM (#13402) 2025-02-17 19:52:35 -08:00
b3942e157e [Bugfix][CI][V1] Work around V1 + CUDA Graph + torch._scaled_mm fallback issue (#13425)
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
2025-02-18 00:32:48 +00:00
cd4a72a28d [V1][Spec decode] Move drafter to model runner (#13363)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-02-17 15:40:12 -08:00
6ac485a953 [V1][PP] Fix intermediate tensor values (#13417)
Signed-off-by: Cody Yu <hao.yu.cody@gmail.com>
2025-02-17 13:37:45 -08:00
4c21ce9eba [V1] Get input tokens from scheduler (#13339)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-02-17 11:01:07 -08:00
ce77eb9410 [Bugfix] Fix VLLM_USE_MODELSCOPE issue (#13384) 2025-02-17 14:22:01 +00:00
30513d1cb6 [Bugfix] fix xpu communicator (#13368)
Signed-off-by: yan ma <yan.ma@intel.com>
2025-02-17 20:59:18 +08:00
1f69c4a892 [Model] Support Mamba2 (Codestral Mamba) (#9292)
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
Co-authored-by: Yu Chin Fabian Lim <flim@sg.ibm.com>
2025-02-17 20:17:50 +08:00
7b623fca0b [VLM] Check required fields before initializing field config in DictEmbeddingItems (#13380) 2025-02-17 01:36:07 -08:00
238dfc8ac3 [MISC] tiny fixes (#13378) 2025-02-17 00:57:13 -08:00
45186834a0 Run v1 benchmark and integrate with PyTorch OSS benchmark database (#13068)
Signed-off-by: Huy Do <huydhn@gmail.com>
2025-02-17 08:16:32 +00:00
f857311d13 Fix spelling error in index.md (#13369) 2025-02-17 06:53:20 +00:00
46cdd59577 [Feature][Spec Decode] Simplify the use of Eagle Spec Decode (#12304)
Signed-off-by: Shangming Cai <caishangming@linux.alibaba.com>
2025-02-16 19:32:26 -08:00
2010f04c17 [V1][Misc] Avoid unnecessary log output (#13289) 2025-02-16 19:26:24 -08:00
69e1d23e1e [V1][BugFix] Clean up rejection sampler & Fix warning msg (#13362)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-02-16 12:25:29 -08:00
d67cc21b78 [Bugfix][Platform][CPU] Fix cuda platform detection on CPU backend edge case (#13358)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-02-16 18:55:27 +00:00
e18227b04a [V1][PP] Cache Intermediate Tensors (#13353)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-02-16 10:02:27 -08:00
7b89386553 [V1][BugFix] Add __init__.py to v1/spec_decode/ (#13359)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-02-16 09:39:08 -08:00
da833b0aee [Docs] Change myenv to vllm. Update python_env_setup.inc.md (#13325) 2025-02-16 16:04:21 +00:00
5d2965b7d7 [Bugfix] Fix 2 Node and Spec Decode tests (#13341)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-02-16 22:20:22 +08:00
a0231b7c25 [platform] add base class for communicators (#13208)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-02-16 22:14:22 +08:00
124776ebd5 [ci] skip failed tests for flashinfer (#13352)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-02-16 22:09:15 +08:00
b7d309860e [V1] Update doc and examples for H2O-VL (#13349)
Signed-off-by: Roger Wang <ywang@roblox.com>
2025-02-16 10:35:54 +00:00
dc0f7ccf8b [BugFix] Enhance test_pos_encoding to support execution on multi-devices (#13187)
Signed-off-by: wchen61 <wchen61@foxmail.com>
2025-02-16 08:59:49 +00:00
d3d547e057 [Bugfix] Pin xgrammar to 0.1.11 (#13338) 2025-02-15 19:42:25 -08:00
12913d17ba [Quant] Add SupportsQuant to phi3 and clip (#13104) 2025-02-15 19:28:33 -08:00
80f63a3966 [V1][Spec Decode] Ngram Spec Decode (#12193)
Signed-off-by: LiuXiaoxuanPKU <lilyliupku@gmail.com>
2025-02-15 18:05:11 -08:00
367cb8ce8c [Doc] [2/N] Add Fuyu E2E example for multimodal processor (#13331) 2025-02-15 07:06:23 -08:00
54ed913f34 [ci/build] update flashinfer (#13323) 2025-02-15 05:33:13 -08:00
9206b3d7ec [V1][PP] Run engine busy loop with batch queue (#13064) 2025-02-15 03:59:01 -08:00
ed0de3e4b8 [AMD] [Model] DeepSeek tunings (#13199) 2025-02-15 03:58:09 -08:00
2ad1bc7afe [V1][Metrics] Add iteration_tokens_total histogram from V0 (#13288) 2025-02-15 03:56:19 -08:00
7fdaaf48ef [Bugfix] Fix qwen2.5-vl image processor (#13286) 2025-02-15 03:00:11 -08:00
067fa2255b [Bugfix]Fix search start_index of stop_checker (#13280) 2025-02-14 21:39:42 -08:00
9076325677 [BugFix] Don't scan entire cache dir when loading model (#13302) 2025-02-14 21:33:31 -08:00
97a3d6d995 [Bugfix] Massage MLA's usage of flash attn for RoCM (#13310) 2025-02-14 21:33:25 -08:00
579d7a63b2 [Bugfix][Docs] Fix offline Whisper (#13274) 2025-02-14 21:32:37 -08:00
c9f9d5b397 [Bugfix][AMD] Update torch_bindings so that scaled_fp4_quant isn't build on ROCm (#13235) 2025-02-14 20:30:42 -08:00
0c73026844 [V1][PP] Fix memory profiling in PP (#13315)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-02-14 20:17:25 -08:00
6a854c7a2b [V1][Sampler] Don't apply temp for greedy-only (#13311)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-02-14 18:10:53 -08:00
e7eea5a520 [V1][CI] Fix failed v1-test because of min_p (#13316)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-02-14 17:29:51 -08:00
a12934d3ec [V1][Core] min_p sampling support (#13191)
Signed-off-by: Aoyu <aoyuzhan@amazon.com>
Co-authored-by: Aoyu <aoyuzhan@amazon.com>
2025-02-14 15:50:05 -08:00
3bcb8c75da [Core] Reduce TTFT with concurrent partial prefills (#10235)
Signed-off-by: Joe Runde <Joseph.Runde@ibm.com>
Signed-off-by: Prashant Gupta <prashantgupta@us.ibm.com>
Co-authored-by: Prashant Gupta <prashantgupta@us.ibm.com>
Co-authored-by: Cody Yu <hao.yu.cody@gmail.com>
2025-02-14 15:36:07 -08:00
5e5c8e091e [Quant][Perf] Use moe_wna16 kernel by default for MoEs with many experts (#13236)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-02-14 12:53:42 -08:00
c9e2d644e7 [Hardware][Gaudi][Bugfix] Fix error for guided decoding (#12317) 2025-02-14 04:36:49 -08:00
7734e9a291 [Core] choice-based structured output with xgrammar (#12632) 2025-02-14 04:36:05 -08:00
6224a9f620 Support logit_bias in v1 Sampler (#13079) 2025-02-14 04:34:59 -08:00
085b7b2d6c [V1] Simplify GPUModelRunner._update_states check (#13265) 2025-02-14 04:33:43 -08:00
4da1f667e9 [VLM] Keep track of whether prompt replacements have been applied (#13215) 2025-02-14 04:20:46 -08:00
556ef7f714 [Misc] Log time consumption of sleep and wake-up (#13115)
Signed-off-by: Jun Duan <jun.duan.phd@outlook.com>
2025-02-14 20:10:21 +08:00
83481ceb49 [Bugfix] Fix missing parentheses (#13263) 2025-02-14 01:07:10 -08:00
185cc19f92 [Frontend] Optionally remove memory buffer used for uploading to URLs in run_batch (#12927)
Signed-off-by: Pooya Davoodi <pooya.davoodi@parasail.io>
2025-02-14 08:22:42 +00:00
45f90bcbba [WIP] TPU V1 Support Refactored (#13049) 2025-02-14 00:21:53 -08:00
b0ccfc565a [Bugfix][V1] GPUModelRunner._update_states should return True when there is a finished request in batch (#13126) 2025-02-13 22:39:20 -08:00
ba59b78a9c [ROCm][V1] Add intial ROCm support to V1 (#12790) 2025-02-13 22:21:50 -08:00
cbc40128eb [V1] LoRA - Enable Serving Usecase (#12883)
Signed-off-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
2025-02-14 14:21:12 +08:00
f0b2da72a8 Expand MLA to support most types of quantization (#13181) 2025-02-13 22:19:22 -08:00
f2b20fe491 Consolidate Llama model usage in tests (#13094) 2025-02-13 22:18:03 -08:00
40932d7a05 [Misc] Remove redundant statements in scheduler.py (#13229) 2025-02-13 22:07:25 -08:00
84683fa271 [Bugfix] Offline example of disaggregated prefill (#13214) 2025-02-13 20:20:47 -08:00
067678262a [Bugfix][CI] Inherit codespell settings from pyproject.toml in the pre-commit-config (#13237) 2025-02-13 20:19:43 -08:00
09545c0a94 [Bugfix/CI] Turn test_compressed_tensors_2of4_sparse back on (#13250) 2025-02-13 20:19:25 -08:00
dd5ede4440 [V1] Consolidate MM cache size to vllm.envs (#13239) 2025-02-13 20:19:03 -08:00
8c32b08a86 [Kernel] Fix awq error when n is not divisable by 128 (#13227) 2025-02-13 20:07:05 -08:00
410886950a [ROCm] Avoid using the default stream on ROCm (#13238)
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
2025-02-14 09:29:26 +08:00
e38be640e6 Revert "Add label if pre-commit passes" (#13242) 2025-02-13 16:12:32 -08:00
c1e37bf71b [Kernel][Bugfix] Refactor and Fix CUTLASS 2:4 Sparse Kernels (#13198)
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
2025-02-14 00:01:14 +00:00
2344192a55 Optimize moe_align_block_size for deepseek_v3 (#12850)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-02-13 18:43:37 -05:00
bffddd9a05 Add label if pre-commit passes (#12527)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-02-13 20:51:30 +00:00
d84cef76eb [Frontend] Add /v1/audio/transcriptions OpenAI API endpoint (#12909) 2025-02-13 07:23:45 -08:00
37dfa60037 [Bugfix] Missing Content Type returns 500 Internal Server Error (#13193) 2025-02-13 06:52:22 -08:00
1bc3b5e71b [VLM] Separate text-only and vision variants of the same model architecture (#13157) 2025-02-13 06:19:15 -08:00
02ed8a1fbe [Misc] Qwen2.5-VL Optimization (#13155) 2025-02-13 06:17:57 -08:00
2092a6fa7d [V1][Core] Add worker_base for v1 worker (#12816)
Signed-off-by: Aoyu <aoyuzhan@amazon.com>
Signed-off-by: youkaichao <youkaichao@gmail.com>
Co-authored-by: Aoyu <aoyuzhan@amazon.com>
Co-authored-by: youkaichao <youkaichao@gmail.com>
2025-02-13 20:35:18 +08:00
c9d3ecf016 [VLM] Merged multi-modal processor for Molmo (#12966) 2025-02-13 04:34:00 -08:00
fdcf64d3c6 [V1] Clarify input processing and multimodal feature caching logic (#13211) 2025-02-13 03:43:24 -08:00
578087e56c [Frontend] Pass pre-created socket to uvicorn (#13113) 2025-02-13 00:51:46 -08:00
fa253f1a70 [VLM] Remove input processor from clip and siglip (#13165) 2025-02-13 00:31:37 -08:00
9605c1256e [V1][core] Implement pipeline parallel on Ray (#12996) 2025-02-13 08:02:46 +00:00
0ccd8769fb [CI/Build] Allow ruff to auto-fix some issues (#13180)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-02-13 07:45:38 +00:00
cb944d5818 Allow Unsloth Dynamic 4bit BnB quants to work (#12974) 2025-02-12 23:13:08 -08:00
d46d490c27 [Frontend] Move CLI code into vllm.cmd package (#12971) 2025-02-12 23:12:21 -08:00
04f50ad9d1 [Bugfix] deepseek_r1_reasoning_parser put reason content in wrong field in certain edge case (#13097) 2025-02-12 23:11:26 -08:00
60c68df6d1 [Build] Automatically use the wheel of the base commit with Python-only build (#13178) 2025-02-12 23:10:28 -08:00
009439caeb Simplify logic of locating CUDART so file path (#13203)
Signed-off-by: Lu Fang <lufang@fb.com>
2025-02-13 13:52:41 +08:00
bc55d13070 [VLM] Implement merged multimodal processor for Mllama (#11427) 2025-02-12 20:26:21 -08:00
d88c8666a1 [Bugfix][Example] Fix GCed profiling server for TPU (#12792)
Signed-off-by: mgoin <michael@neuralmagic.com>
2025-02-13 11:52:11 +08:00
4fc5c23bb6 [NVIDIA] Support nvfp4 quantization (#12784) 2025-02-12 19:51:51 -08:00
9f9704dca6 [perf-benchmark] cleanup unused Docker images and volumes in H100 benchmark instance (#12706) 2025-02-12 19:51:33 -08:00
8eafe5eaea [CI/Build] Ignore ruff warning up007 (#13182)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-02-13 11:48:31 +08:00
4c0d93f4b2 [V1][Bugfix] Copy encoder input ids to fix set iteration issue during VLM abort (#13173)
Signed-off-by: andoorve <37849411+andoorve@users.noreply.github.com>
2025-02-12 12:58:11 -08:00
14b7899d10 [CI] Fix failing FP8 cpu offload test (#13170)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-02-12 19:16:06 +00:00
09972e716c [Bugfix] Allow fallback to AWQ from AWQMarlin at per-layer granularity (#13119) 2025-02-12 09:19:53 -08:00
36a08630e8 [CORE] [QUANT] Support for GPTQModel's dynamic quantization per module override/control (#7086) 2025-02-12 09:19:43 -08:00
2c2b560f48 [CI/Build] Use mypy matcher for pre-commit CI job (#13162)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-02-12 17:12:22 +00:00
042c3419fa Introduce VLLM_CUDART_SO_PATH to allow users specify the .so path (#12998)
Signed-off-by: Lu Fang <lufang@fb.com>
2025-02-12 09:06:13 -08:00
82cabf53a3 [Misc] Delete unused LoRA modules (#13151) 2025-02-12 08:58:24 -08:00
314cfade02 [Frontend] Generate valid tool call IDs when using tokenizer-mode=mistral (#12332) 2025-02-12 08:29:56 -08:00
985b4a2b19 [Bugfix] Fix num video tokens calculation for Qwen2-VL (#13148)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-02-12 11:55:23 +00:00
f4d97e4fc2 [Bug] [V1] Try fetching stop_reason from EngineOutput before checking the request (#13108) 2025-02-12 02:39:16 -08:00
f1042e86f0 [Misc] AMD Build Improvements (#12923) 2025-02-12 02:36:10 -08:00
7c4033acd4 Further reduce the HTTP calls to huggingface.co (#13107) 2025-02-12 02:34:09 -08:00
d59def4730 Bump actions/setup-python from 5.3.0 to 5.4.0 (#12672) 2025-02-12 16:41:22 +08:00
0c7d9effce Bump helm/chart-testing-action from 2.6.1 to 2.7.0 (#12463) 2025-02-12 16:41:06 +08:00
dd3b4a01f8 Bump actions/stale from 9.0.0 to 9.1.0 (#12462) 2025-02-12 00:40:25 -08:00
a0597c6b75 Bump helm/kind-action from 1.10.0 to 1.12.0 (#11612) 2025-02-12 00:40:19 -08:00
e92694b6fe [Neuron][Kernel] Support Longer Sequences in NKI-based Flash PagedAttention and Improve Efficiency (#12921)
Signed-off-by: Lingfan Yu <lingfany@amazon.com>
2025-02-11 21:12:37 -08:00
842b0fd402 [ci] Add more source file dependencies for some tests (#13123)
Signed-off-by: <>
Co-authored-by: EC2 Default User <ec2-user@ip-172-31-20-117.us-west-2.compute.internal>
2025-02-11 20:38:10 -08:00
974dfd4971 [Model] IBM/NASA Prithvi Geospatial model (#12830) 2025-02-11 20:34:30 -08:00
3ee696a63d [RFC][vllm-API] Support tokenizer registry for customized tokenizer in vLLM (#12518)
Signed-off-by: Keyun Tong <tongkeyun@gmail.com>
2025-02-12 12:25:58 +08:00
72c2b68dc9 [Misc] Move pre-commit suggestion back to the end (#13114)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-02-11 22:34:16 +00:00
14ecab5be2 [Bugfix] Guided decoding falls back to outlines when fails to import xgrammar (#12976)
Signed-off-by: Yuan Tang <terrytangyuan@gmail.com>
2025-02-11 18:17:44 +00:00
deb6c1c6b4 [Doc] Improve OpenVINO installation doc (#13102)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-02-11 18:02:46 +00:00
565c1efa65 [CI/Build][Bugfix] Fix CPU backend default threads num (#13077) 2025-02-11 16:55:56 +00:00
2b25b7d2e1 Fix initializing GGUF weights for ColumnParallelLinear when using tensor parallel > 1 (#13023) 2025-02-11 08:38:48 -08:00
6c4dbe23eb [BugFix] Pop instead of del CUDA_VISIBLE_DEVICES (#12962)
Signed-off-by: Hollow Man <hollowman@opensuse.org>
2025-02-12 00:21:50 +08:00
21f5d50fa5 [Bugfix] Do not use resource module on Windows (#12858) (#13029) 2025-02-11 08:21:18 -08:00
bf3e05215c [Misc] Fix typo at comments at metrics.py (#13024) 2025-02-11 08:20:37 -08:00
ad9776353e Set torch_dtype in TransformersModel (#13088)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-02-11 23:51:19 +08:00
75e6e14516 [V1][Metrics] Add several request timing histograms (#12644)
Signed-off-by: Mark McLoughlin <markmc@redhat.com>
2025-02-11 10:14:00 -05:00
110f59a33e [Bugfix] fix flaky test (#13089)
Signed-off-by: மனோஜ்குமார் பழனிச்சாமி <smartmanoj42857@gmail.com>
2025-02-11 14:41:20 +00:00
2e3b969ec0 [Platform] add pre_register_and_update function (#12432)
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-02-11 22:06:46 +08:00
da317197dd [Build] Fix cuda link target of cumem_allocator in CPU env (#12863)
Signed-off-by: YuhongGuo <yuhong.gyh@antgroup.com>
Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>
2025-02-11 21:55:57 +08:00
7539bbc6a6 [ROCm] Using a more precise memory profiling (#12624)
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
2025-02-11 21:47:10 +08:00
9cf4759493 [executor] init local_rank as device index (#13027)
Signed-off-by: Mengqing Cao <cmq0113@163.com>
2025-02-11 21:20:53 +08:00
41c5dd45b9 [V1][Metrics] Add GPU prefix cache hit rate % gauge (#12592) 2025-02-11 08:27:25 +00:00
fc6485d277 [Bugfix]: Reasoning output bug according to the chat template change (#13025)
Signed-off-by: Ce Gao <cegao@tensorchord.ai>
2025-02-11 15:49:03 +08:00
78a141d768 [Misc] LoRA - Refactor Punica ops tests (#12970)
Signed-off-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
2025-02-11 07:26:03 +00:00
c320ca8edd [Core] Don't do platform detection at import time (#12933)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-02-11 07:25:25 +00:00
58047c6f04 [Benchmark] Add BurstGPT to benchmark_serving (#13063)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com>
2025-02-10 21:25:30 -08:00
cb080f32e3 [Bugfix] Support missing tool parameters in mistral tokenizer (#12884)
Signed-off-by: Florian Greinacher <florian.greinacher@siemens.com>
2025-02-11 03:33:33 +00:00
2c0f58203c [Docs] Annouce Meta Meetup (#13065)
Signed-off-by: simon-mo <simon.mo@hey.com>
2025-02-10 18:24:29 -08:00
2ff4857678 [V1][Minor] Move scheduler outputs to a separate file (#13062)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-02-11 02:10:06 +00:00
91e876750e [misc] Fix setup.py condition to avoid AMD from being mistaken with CPU (#13022)
Signed-off-by: kevin <kevin@anyscale.com>
2025-02-10 18:06:16 -08:00
08b2d845d6 [Model] Ultravox Model: Support v0.5 Release (#12912)
Signed-off-by: Farzad Abdolhosseini <farzad@fixie.ai>
2025-02-10 22:02:48 +00:00
2ae889052c Fix seed parameter behavior in vLLM (#13007)
Signed-off-by: மனோஜ்குமார் பழனிச்சாமி <smartmanoj42857@gmail.com>
2025-02-10 23:26:50 +08:00
51f0b5f7f6 [Bugfix] Clean up and fix multi-modal processors (#13012)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-02-10 10:45:21 +00:00
fde71262e0 [misc] Add retries with exponential backoff for HF file existence check (#13008) 2025-02-10 01:15:02 -08:00
243137143c [Doc] Add link to tool_choice tracking issue in tool_calling.md (#13003)
Signed-off-by: Yuan Tang <terrytangyuan@gmail.com>
2025-02-10 06:09:33 +00:00
b2496bb07f [core] fix sleep mode and pytorch checkpoint compatibility (#13001)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-02-10 13:03:43 +08:00
44607e07d3 Check if selected backend is None in get_attn_backend_cls() (#12975)
Signed-off-by: Yuan Tang <terrytangyuan@gmail.com>
2025-02-10 11:45:07 +08:00
67c4637ccf [V1] Use msgpack for core request serialization (#12918)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-02-10 11:35:56 +08:00
aa0ca5ebb7 [core][rlhf] add colocate example for RLHF (#12984)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-02-10 10:28:59 +08:00
59fff4a01a [core] improve error handling when wake up from sleep mode (#12981)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-02-10 09:38:57 +08:00
29f1d47e73 [MISC] Always import version library first in the vllm package (#12979)
Signed-off-by: Lu Fang <lufang@fb.com>
2025-02-09 18:56:40 +08:00
cf797aa856 [core] port pynvml into vllm codebase (#12963)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-02-09 15:00:00 +08:00
24700c346b [V1] Cache uses_mrope in GPUModelRunner (#12969) 2025-02-08 15:32:32 -08:00
d366ccc4e3 [RFC] [Mistral] FP8 format (#10130)
Signed-off-by: mgoin <mgoin64@gmail.com>
Co-authored-by: mgoin <mgoin64@gmail.com>
2025-02-08 14:12:53 -07:00
870c37481e [V1][Minor] Remove outdated comment (#12968)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-02-08 12:48:30 -08:00
86222a3dab [VLM] Merged multi-modal processor for GLM4V (#12449)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-02-08 20:32:16 +00:00
fe743b798d [bugfix] fix early import of flash attention (#12959)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-02-09 00:06:56 +08:00
913df14da3 [Bugfix] Remove unused seq_group_metadata_list from ModelInputForGPU (#12935)
Signed-off-by: Shangming Cai <caishangming@linux.alibaba.com>
2025-02-08 14:46:19 +00:00
8a69e0e20e [CI/Build] Auto-fix Markdown files (#12941) 2025-02-08 04:25:15 -08:00
4c8dd12ef3 [Misc] Add qwen2.5-vl BNB support (#12944) 2025-02-08 04:24:47 -08:00
256a2d29dc [Doc] Correct HF repository for TeleChat2 models (#12949) 2025-02-08 01:42:15 -08:00
c45d398e6f [CI] Resolve transformers-neuronx version conflict (#12925) 2025-02-08 01:41:35 -08:00
011e612d92 [Misc] Log time consumption on weight downloading (#12926) 2025-02-08 09:16:42 +00:00
7e1837676a [misc] Add LoRA to benchmark_serving (#12898)
Signed-off-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
2025-02-08 17:15:44 +08:00
2880e21e3d [Hardware][Intel-Gaudi] Enable long-contexts + LoRA support for Intel Gaudi (#12812)
Signed-off-by: Sanju C Sudhakaran <scsudhakaran@habana.ai>
2025-02-08 17:15:30 +08:00
407b5537db [Build] Make pypi install work on CPU platform (#12874) 2025-02-08 01:15:15 -08:00
4ea48fb35c [V1][Minor] Move cascade attn logic outside _prepare_inputs (#12943)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-02-08 00:39:09 -08:00
e31498bdcb [Misc] Add offline test for disaggregated prefill (#12418) 2025-02-08 08:38:20 +00:00
91dd8f7aa6 [bugfix] respect distributed_executor_backend in world_size=1 (#12934)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-02-08 16:17:08 +08:00
d01f66b039 [Bugfix] Fix multi-round chat error when mistral tokenizer is used (#12859)
Signed-off-by: Zifei Tong <zifeitong@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2025-02-08 07:04:34 +00:00
cc01223f3b [Misc] Fix typo in the example file (#12896)
Signed-off-by: Zhao Ke <yingxiongraomingzk@gmail.com>
2025-02-08 06:56:43 +00:00
306923da82 [Bugfix] Fix Qwen2_5_VLForConditionalGeneration packed_modules_mapping (#12905) 2025-02-07 21:02:53 -08:00
3243158336 [V1] Move KV block hashes from Request to KVCacheManager (#12922)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-02-07 19:14:10 -08:00
b21f0f9d17 [V1][Minor] Remove outdated comment (#12928)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-02-07 19:07:37 -08:00
45cbc4991d [Bugfix] Fix disagg hang caused by the prefill and decode communication issues (#12723)
Signed-off-by: Lu Fang <lufang@fb.com>
2025-02-07 16:39:50 -08:00
932c6b7461 [V1] LM Eval With Streaming Integration Tests (#11590) 2025-02-07 15:07:03 -08:00
eaa92d4437 [ROCm] [Feature] [Doc] [Dockerfile] [BugFix] Support Per-Token-Activation Per-Channel-Weight FP8 Quantization Inferencing (#12501) 2025-02-07 08:13:43 -08:00
0630d4537a [V1] Logprobs and prompt logprobs support (#9880)
This PR is adding support for sample logprobs & prompt logprobs to vLLM v1.

New behavior:

- During model execution, model runner computes sample logprobs (if user-provided logprobs setting is not None) and prompt logprobs (if user-provided prompt_logprobs setting is not None). For both sample and prompt logprobs, the engine core returns 3 vectors: token ids, token logprob values, token ranks. Ranks reflect tokens' 1-indexed positions in the vocabulary vector after sorting the vocabulary by log probability in descending order.
- In scheduler.update_from_output(), sample and prompt logprobs are incorporated into the EngineCoreOutput data structure which is transferred to the engine client. If multiprocessing is enabled, then sample and prompt logprobs will be (de)serialized when the EngineCoreOutput data structure is (de)serialized.
- During output processing, the LogprobsProcessor transforms the triplet of token ids, token logprobs values, and token ranks into the OpenAI-compatible List[Dict[token id,Logprob]] format (for sample and prompt logprobs respectively.)
- Each Logprob instance (whether sample- or prompt-) consists of a token's log-probability, rank, and detokenized string representation. Note that logprob detokenization is handled by the LogprobsProcessor not the detokenizer.

Signed-off-by: Andrew Feldman <afeldman@neuralmagic.com>
Signed-off-by: Nick Hill <nhill@redhat.com>
Signed-off-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com>


Co-authored-by: rshaw@neuralmagic.com <rshaw@neuralmagic.com>
Co-authored-by: Nick Hill <nhill@redhat.com>
2025-02-07 07:26:20 -08:00
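
As a rough illustration of the rank definition and the OpenAI-style conversion described in the commit message above (a sketch only; the `Logprob` class and helper names here are simplified stand-ins, not vLLM's actual types):

```python
from dataclasses import dataclass

import torch


@dataclass
class Logprob:  # simplified stand-in for vLLM's internal Logprob type
    logprob: float
    rank: int
    decoded_token: str


def token_ranks(logprobs: torch.Tensor, token_ids: torch.Tensor) -> torch.Tensor:
    """1-indexed rank of each chosen token when the vocabulary is sorted
    by log-probability in descending order (rank 1 == most likely token)."""
    chosen = logprobs.gather(-1, token_ids.unsqueeze(-1))  # [num_tokens, 1]
    return (logprobs > chosen).sum(dim=-1) + 1             # [num_tokens]


def to_openai_format(ids, logprob_values, ranks, detokenize):
    """Turn the (token ids, logprob values, ranks) triplet into the
    OpenAI-compatible list of {token_id: Logprob} dicts."""
    return [
        {int(i): Logprob(float(lp), int(r), detokenize(int(i)))}
        for i, lp, r in zip(ids, logprob_values, ranks)
    ]
```
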
538fab93cd PR #12718 (#12718) 2025-02-07 06:22:37 -08:00
ce26b16268 [Misc] Remove unnecessary detokenization in multimodal processing (#12868) 2025-02-07 06:21:17 -08:00
1918aa1b80 [MISC][EASY] Break check file names into entry and args in the pre-commit hooks (#12880)
Signed-off-by: Lu Fang <lufang@fb.com>
2025-02-07 13:04:39 +00:00
6e1fc61f0f Prevent unecessary requests to huggingface hub (#12837) 2025-02-06 21:37:41 -08:00
aa375dca9f [Bugfix] Missing quant_config in deepseek embedding layer (#12836) 2025-02-06 21:35:09 -08:00
433c4a4923 Make vllm compatible with verl (#12824)
Co-authored-by: zhangshulai <zhangshulai@bytedance.com>
2025-02-07 11:54:20 +08:00
ef533d25fb [Bugfix] FA2 illegal memory access (#12848) 2025-02-06 19:54:07 -08:00
b260782357 [misc] Revert # 12833 (#12857)
Signed-off-by: <>
Co-authored-by: EC2 Default User <ec2-user@ip-172-31-20-117.us-west-2.compute.internal>
2025-02-06 16:29:12 -08:00
741429a4cd [MISC] Check space in the file names in the pre commit checks (#12804)
Signed-off-by: Lu Fang <lufang@fb.com>
2025-02-06 15:36:21 -08:00
aff404571b Add Bamba Model (#10909)
Signed-off-by: Yu Chin Fabian Lim <flim@sg.ibm.com>
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>
2025-02-06 15:22:42 -08:00
467a96a541 [V1] LoRA Support (#10957)
Signed-off-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
2025-02-06 09:32:51 -08:00
8108ac841d [Bugfix] Fix unsupported FA version check for Turing GPU (#12828) 2025-02-06 09:18:22 -08:00
afe74f7a96 [Doc] double quote cmake package in build.inc.md (#12840) 2025-02-06 09:17:55 -08:00
09b95e36ab [torch.compile] PyTorch 2.6 and nightly compatibility (#12393)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-02-07 01:09:07 +08:00
85ac82d228 [Kernel] Make rotary_embedding ops more flexible with input shape (#12777) 2025-02-06 08:46:13 -08:00
1e57b1ee63 [Misc] Remove unnecessary decode call (#12833) 2025-02-06 08:45:44 -08:00
e152f29502 [misc] Reduce number of config file requests to HuggingFace (#12797)
Signed-off-by: EC2 Default User <ec2-user@ip-172-31-20-117.us-west-2.compute.internal>
Signed-off-by: <>
Co-authored-by: EC2 Default User <ec2-user@ip-172-31-20-117.us-west-2.compute.internal>
2025-02-06 14:59:18 +00:00
c786e757fa [Attention] Use FA3 for MLA on Hopper (#12807)
Signed-off-by: Lucas Wilkinson <lwilkinson@neuralmagic.com>
2025-02-06 11:43:12 +00:00
cefd56ee35 [Docs] Add Google Cloud Slides (#12814) 2025-02-06 01:02:38 -08:00
7ca9934fe7 [Misc] Update w2 scale loading for GPTQMarlinMoE (#12757) 2025-02-06 01:02:14 -08:00
1289 changed files with 108941 additions and 26563 deletions

View File

@ -4,8 +4,8 @@ tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.233
value: 0.231
- name: "exact_match,flexible-extract"
value: 0.236
value: 0.22
limit: 1000
num_fewshot: 5

View File

@ -13,6 +13,7 @@ from pathlib import Path
import lm_eval
import numpy
import pytest
import yaml
RTOL = 0.05
@ -46,6 +47,10 @@ def test_lm_eval_correctness():
eval_config = yaml.safe_load(
Path(TEST_DATA_FILE).read_text(encoding="utf-8"))
if eval_config[
"model_name"] == "nm-testing/Meta-Llama-3-70B-Instruct-FBGEMM-nonuniform": #noqa: E501
pytest.skip("FBGEMM is currently failing on main.")
# Launch eval requests.
results = launch_lm_eval(eval_config)

View File

@ -1,15 +1,13 @@
# vLLM benchmark suite
## Introduction
This directory contains two sets of benchmarks for vllm.
- Performance benchmark: benchmark vllm's performance under various workloads, for **developers** to gain clarity on whether their PR improves/degrades vllm's performance
- Nightly benchmark: compare vllm's performance against alternatives (tgi, trt-llm and lmdeploy), for **the public** to know when to choose vllm.
See [vLLM performance dashboard](https://perf.vllm.ai) for the latest performance benchmark results and [vLLM GitHub README](https://github.com/vllm-project/vllm/blob/main/README.md) for latest nightly benchmark results.
See [vLLM performance dashboard](https://perf.vllm.ai) for the latest performance benchmark results and [vLLM GitHub README](https://github.com/vllm-project/vllm/blob/main/README.md) for latest nightly benchmark results.
## Performance benchmark quick overview
@ -19,17 +17,14 @@ See [vLLM performance dashboard](https://perf.vllm.ai) for the latest performan
**For benchmarking developers**: please try your best to constrain the duration of benchmarking to about 1 hr so that it won't take forever to run.
## Nightly benchmark quick overview
**Benchmarking Coverage**: Fix-qps serving on A100 (the support for FP8 benchmark on H100 is coming!) on Llama-3 8B, 70B and Mixtral 8x7B.
**Benchmarking Coverage**: Fix-qps serving on A100 (the support for FP8 benchmark on H100 is coming!) on Llama-3 8B, 70B and Mixtral 8x7B.
**Benchmarking engines**: vllm, TGI, trt-llm and lmdeploy.
**Benchmarking Duration**: about 3.5hrs.
## Trigger the benchmark
Performance benchmark will be triggered when:
@ -39,16 +34,11 @@ Performance benchmark will be triggered when:
Nightly benchmark will be triggered when:
- Every commit for those PRs with `perf-benchmarks` label and `nightly-benchmarks` label.
## Performance benchmark details
See [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases.
#### Latency test
### Latency test
Here is an example of one test inside `latency-tests.json`:
@ -68,23 +58,25 @@ Here is an example of one test inside `latency-tests.json`:
```
In this example:
- The `test_name` attribute is a unique identifier for the test. In `latency-tests.json`, it must start with `latency_`.
- The `parameters` attribute controls the command line arguments to be used for `benchmark_latency.py`. Note that you should use an underscore `_` instead of a dash `-` when specifying the command line arguments, and `run-performance-benchmarks.sh` will convert the underscores to dashes when feeding the arguments to `benchmark_latency.py`. For example, the corresponding command line arguments for `benchmark_latency.py` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15`
- The `test_name` attribute is a unique identifier for the test. In `latency-tests.json`, it must start with `latency_`.
- The `parameters` attribute controls the command line arguments to be used for `benchmark_latency.py`. Note that you should use an underscore `_` instead of a dash `-` when specifying the command line arguments, and `run-performance-benchmarks.sh` will convert the underscores to dashes when feeding the arguments to `benchmark_latency.py`. For example, the corresponding command line arguments for `benchmark_latency.py` will be `--model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15`
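
To make the underscore-to-dash conversion concrete, here is a hedged Python sketch of what the bash runner effectively does when it builds the `benchmark_latency.py` command line (the helper name and the bare-flag handling are assumptions; the real logic lives in `run-performance-benchmarks.sh`):

```python
# Illustrative only: the real conversion is done by run-performance-benchmarks.sh
# (a bash script); the helper name here is made up.
def params_to_cli_args(parameters: dict) -> str:
    """Turn a parameters mapping with underscore keys into dash-style CLI args."""
    parts = []
    for key, value in parameters.items():
        flag = "--" + key.replace("_", "-")
        # An empty string is treated as a bare flag with no value.
        parts.append(flag if value == "" else f"{flag} {value}")
    return " ".join(parts)


example = {
    "model": "meta-llama/Meta-Llama-3-8B",
    "tensor_parallel_size": 1,
    "load_format": "dummy",
    "num_iters_warmup": 5,
    "num_iters": 15,
}
print(params_to_cli_args(example))
# --model meta-llama/Meta-Llama-3-8B --tensor-parallel-size 1 --load-format dummy --num-iters-warmup 5 --num-iters 15
```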
Note that the performance numbers are highly sensitive to the value of the parameters. Please make sure the parameters are set correctly.
WARNING: The benchmarking script will save json results by itself, so please do not configure `--output-json` parameter in the json file.
### Throughput test
#### Throughput test
The tests are specified in `throughput-tests.json`. The syntax is similar to `latency-tests.json`, except that the parameters will be fed to `benchmark_throughput.py`.
The number of this test is also stable -- a slight change in the value of this number might reflect a large change in actual performance.
#### Serving test
### Serving test
We test the throughput by using `benchmark_serving.py` with request rate = inf to cover the online serving overhead. The corresponding parameters are in `serving-tests.json`, and here is an example:
```
```json
[
{
"test_name": "serving_llama8B_tp1_sharegpt",
@ -109,6 +101,7 @@ We test the throughput by using `benchmark_serving.py` with request rate = inf t
```
Inside this example:
- The `test_name` attribute is also a unique identifier for the test. It must start with `serving_`.
- The `server-parameters` includes the command line arguments for the vLLM server.
- The `client-parameters` includes the command line arguments for `benchmark_serving.py`.
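
For orientation, the structure described above can be sketched as a Python dict and dumped to JSON; everything in this snippet beyond the attribute names quoted in the text is an illustrative assumption, not the contents of the repository's `serving-tests.json`:

```python
import json

# Hypothetical serving test entry, for orientation only -- the qps_list and
# dataset-related fields/values below are assumptions.
serving_test = {
    "test_name": "serving_llama8B_tp1_sharegpt",  # must start with "serving_"
    "qps_list": [1, 4, 16, "inf"],                # request rates swept by the runner
    "server_parameters": {                        # command line arguments for the vLLM server
        "model": "meta-llama/Meta-Llama-3-8B",
        "tensor_parallel_size": 1,
        "swap_space": 16,
    },
    "client_parameters": {                        # command line arguments for benchmark_serving.py
        "model": "meta-llama/Meta-Llama-3-8B",
        "backend": "vllm",
        "dataset_name": "sharegpt",
        "num_prompts": 200,
    },
}
print(json.dumps([serving_test], indent=2))
```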
@ -118,36 +111,33 @@ The number of this test is less stable compared to the delay and latency benchma
WARNING: The benchmarking script will save json results by itself, so please do not configure `--save-results` or other results-saving-related parameters in `serving-tests.json`.
#### Visualizing the results
### Visualizing the results
The `convert-results-json-to-markdown.py` script helps you put the benchmarking results into a markdown table by formatting [descriptions.md](tests/descriptions.md) with the real benchmarking results (a toy sketch of this templating appears at the end of this section).
You can find the result presented as a table inside the `buildkite/performance-benchmark` job page.
If you do not see the table, please wait until the benchmark finishes running.
The json version of the table (together with the json version of the benchmark) will also be attached to the markdown file.
The raw benchmarking results (in the format of json files) are in the `Artifacts` tab of the benchmarking.
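
A toy Python sketch of that templating idea (the template text, table, and numbers here are invented; the real script reads descriptions.md and fills placeholders such as `{latency_tests_markdown_table}` from the benchmark JSON):

```python
# A toy sketch of the templating idea, assuming a str.format-style template;
# the real convert-results-json-to-markdown.py builds its tables from the
# actual benchmark JSON and uses more placeholders than shown here.
template = (
    "## Latency tests\n"
    "{latency_tests_markdown_table}\n"
)

latency_table = (
    "| Test name | Mean latency (ms) |\n"
    "|-----------|-------------------|\n"
    "| latency_llama8B_tp1 | 123.4 |\n"  # made-up number, for illustration only
)

print(template.format(latency_tests_markdown_table=latency_table))
```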
## Nightly test details
See [nightly-descriptions.md](nightly-descriptions.md) for a detailed description of the test workload, models, and docker containers used when benchmarking other LLM engines.
### Workflow
#### Workflow
- The [nightly-pipeline.yaml](nightly-pipeline.yaml) specifies the docker containers for different LLM serving engines.
- The [nightly-pipeline.yaml](nightly-pipeline.yaml) specifies the docker containers for different LLM serving engines.
- Inside each container, we run [run-nightly-suite.sh](run-nightly-suite.sh), which will probe the serving engine of the current container.
- The `run-nightly-suite.sh` script then redirects the request to `tests/run-[llm serving engine name]-nightly.sh`, which parses the workload described in [nightly-tests.json](tests/nightly-tests.json) and performs the benchmark.
- Finally, we run [scripts/plot-nightly-results.py](scripts/plot-nightly-results.py) to collect and plot the final benchmarking results, and upload the results to Buildkite.
#### Nightly tests
### Nightly tests
In [nightly-tests.json](tests/nightly-tests.json), we include the command line arguments for the benchmarking commands, together with the benchmarking test cases. The format is very similar to that of the performance benchmarks.
#### Docker containers
### Docker containers
The docker containers for benchmarking are specified in `nightly-pipeline.yaml`.
WARNING: the docker versions are HARD-CODED and SHOULD BE ALIGNED WITH `nightly-descriptions.md`. The docker versions need to be hard-coded as there are several version-specific bug fixes inside `tests/run-[llm serving engine name]-nightly.sh`.
WARNING: bumping `trt-llm` to the latest version is not easy, as it requires updating several protobuf files in [tensorrt-demo](https://github.com/neuralmagic/tensorrt-demo.git).

View File

@ -10,12 +10,18 @@ steps:
- image: badouralix/curl-jq
command:
- sh .buildkite/nightly-benchmarks/scripts/wait-for-image.sh
- label: "Cleanup H100"
agents:
queue: H100
depends_on: ~
command: docker system prune -a --volumes --force
- label: "A100"
# skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
agents:
queue: A100
depends_on: wait-for-container-image
if: build.branch == "main"
plugins:
- kubernetes:
podSpec:
@ -50,6 +56,7 @@ steps:
agents:
queue: H200
depends_on: wait-for-container-image
if: build.branch == "main"
plugins:
- docker#v5.12.0:
image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
@ -75,6 +82,7 @@ steps:
agents:
queue: H100
depends_on: wait-for-container-image
if: build.branch == "main"
plugins:
- docker#v5.12.0:
image: public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:$BUILDKITE_COMMIT
@ -90,3 +98,87 @@ steps:
environment:
- VLLM_USAGE_SOURCE
- HF_TOKEN
# Premerge benchmark
- label: "A100"
# skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
agents:
queue: A100
depends_on: wait-for-container-image
if: build.branch != "main"
plugins:
- kubernetes:
podSpec:
priorityClassName: perf-benchmark
containers:
- image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
command:
- bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
resources:
limits:
nvidia.com/gpu: 8
volumeMounts:
- name: devshm
mountPath: /dev/shm
env:
- name: VLLM_USAGE_SOURCE
value: ci-test
- name: HF_TOKEN
valueFrom:
secretKeyRef:
name: hf-token-secret
key: token
nodeSelector:
nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
volumes:
- name: devshm
emptyDir:
medium: Memory
- label: "H200"
# skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
agents:
queue: H200
depends_on: wait-for-container-image
if: build.branch != "main"
plugins:
- docker#v5.12.0:
image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
command:
- bash
- .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
mount-buildkite-agent: true
propagate-environment: true
ipc: host
gpus: 4,5,6,7
volumes:
- /data/benchmark-hf-cache:/root/.cache/huggingface
environment:
- VLLM_USAGE_SOURCE
- HF_TOKEN
#- block: "Run H100 Benchmark"
#key: block-h100
#depends_on: ~
- label: "H100"
# skip: "use this flag to conditionally skip the benchmark step, useful for PR testing"
agents:
queue: H100
depends_on: wait-for-container-image
if: build.branch != "main"
plugins:
- docker#v5.12.0:
image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
command:
- bash
- .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
mount-buildkite-agent: true
propagate-environment: true
ipc: host
gpus: all # see CUDA_VISIBLE_DEVICES for actual GPUs used
volumes:
- /data/benchmark-hf-cache:/root/.cache/huggingface
environment:
- VLLM_USAGE_SOURCE
- HF_TOKEN

View File

@ -9,20 +9,19 @@ This file contains the downloading link for benchmarking results.
Please download the visualization scripts in the post
## Results reproduction
- Find the docker we use in `benchmarking pipeline`
- Deploy the docker, and inside the docker:
- Download `nightly-benchmarks.zip`.
- In the same folder, run the following code
```
export HF_TOKEN=<your HF token>
apt update
apt install -y git
unzip nightly-benchmarks.zip
VLLM_SOURCE_CODE_LOC=./ bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
```
- Download `nightly-benchmarks.zip`.
- In the same folder, run the following code:
```console
export HF_TOKEN=<your HF token>
apt update
apt install -y git
unzip nightly-benchmarks.zip
VLLM_SOURCE_CODE_LOC=./ bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
```
And the results will be inside `./benchmarks/results`.

View File

@ -2,6 +2,7 @@
# Nightly benchmark
This benchmark aims to:
- Provide performance clarity: show which engine (vllm, tensorrt-llm, lmdeploy and SGLang) leads in performance on which workload.
- Be reproducible: one can run the exact same set of benchmarking commands inside the exact same docker by following reproducing instructions.
@ -9,7 +10,6 @@ Latest results: [results link](https://blog.vllm.ai/2024/09/05/perf-update.html)
Latest reproduction guide: [github issue link](https://github.com/vllm-project/vllm/issues/8176)
## Setup
- Docker images:
@ -33,7 +33,7 @@ Latest reproduction guilde: [github issue link](https://github.com/vllm-project/
- Queries are randomly sampled, and arrival patterns are determined via a Poisson process, all with a fixed random seed.
- Evaluation metrics: Throughput (higher the better), TTFT (time to the first token, lower the better), ITL (inter-token latency, lower the better).
# Known issues
## Known issues
- TRT-LLM crashes with Llama 3.1 8B [issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105).
- TGI does not support `ignore-eos` flag.
- TGI does not support `ignore-eos` flag.

View File

@ -7,10 +7,8 @@
- Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
- Evaluation metrics: end-to-end latency (mean, median, p99).
{latency_tests_markdown_table}
## Throughput tests
- Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed).
@ -19,10 +17,8 @@
- Models: llama-3.1 8B, llama-3 70B, mixtral 8x7B.
- Evaluation metrics: throughput.
{throughput_tests_markdown_table}
## Serving tests
- Input length: randomly sample 200 prompts from ShareGPT dataset (with fixed random seed).
@ -33,13 +29,11 @@
- We also added a speculative decoding test for llama-3 70B, under QPS 2
- Evaluation metrics: throughput, TTFT (time to the first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99).
{serving_tests_markdown_table}
## json version of the benchmarking tables
This section contains the data of the markdown tables above in JSON format.
This section contains the data of the markdown tables above in JSON format.
You can load the benchmarking tables into pandas dataframes as follows:
```python
@ -54,9 +48,9 @@ serving_results = pd.DataFrame.from_dict(benchmarking_results["serving"])
```
The json string for all benchmarking tables:
```json
{benchmarking_results_in_json_string}
```
You can also check the raw experiment data in the Artifact tab of the Buildkite page.
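
Since the diff shows only a fragment of that pandas snippet, here is a self-contained sketch of the same idea with a made-up results blob; only the `"serving"` key appears in the text above, so the other keys, column names, and values are assumptions:

```python
import json

import pandas as pd

# Stand-in for the {benchmarking_results_in_json_string} blob embedded above;
# the "latency" key, column names, and values are assumptions.
benchmarking_results_json = """
{
  "latency": {"test_name": {"0": "latency_llama8B_tp1"}, "mean_latency_ms": {"0": 123.4}},
  "serving": {"test_name": {"0": "serving_llama8B_tp1_sharegpt"}, "throughput": {"0": 456.7}}
}
"""

benchmarking_results = json.loads(benchmarking_results_json)
latency_results = pd.DataFrame.from_dict(benchmarking_results["latency"])
serving_results = pd.DataFrame.from_dict(benchmarking_results["serving"])
print(serving_results)
```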

View File

@ -84,8 +84,13 @@ if __name__ == "__main__":
# this result is generated via `benchmark_serving.py`
# attach the benchmarking command to raw_result
with open(test_file.with_suffix(".commands")) as f:
command = json.loads(f.read())
try:
with open(test_file.with_suffix(".commands")) as f:
command = json.loads(f.read())
except OSError as e:
print(e)
continue
raw_result.update(command)
# update the test name of this result
@ -99,8 +104,13 @@ if __name__ == "__main__":
# this result is generated via `benchmark_latency.py`
# attach the benchmarking command to raw_result
with open(test_file.with_suffix(".commands")) as f:
command = json.loads(f.read())
try:
with open(test_file.with_suffix(".commands")) as f:
command = json.loads(f.read())
except OSError as e:
print(e)
continue
raw_result.update(command)
# update the test name of this result
@ -121,8 +131,13 @@ if __name__ == "__main__":
# this result is generated via `benchmark_throughput.py`
# attach the benchmarking command to raw_result
with open(test_file.with_suffix(".commands")) as f:
command = json.loads(f.read())
try:
with open(test_file.with_suffix(".commands")) as f:
command = json.loads(f.read())
except OSError as e:
print(e)
continue
raw_result.update(command)
# update the test name of this result

View File

@ -426,7 +426,7 @@ main() {
pip install -U transformers
pip install -r requirements-dev.txt
pip install -r requirements/dev.txt
which genai-perf
# check storage

View File

@ -309,11 +309,14 @@ run_serving_tests() {
new_test_name=$test_name"_qps_"$qps
# pass the tensor parallel size to the client so that it can be displayed
# on the benchmark dashboard
client_command="python3 benchmark_serving.py \
--save-result \
--result-dir $RESULTS_FOLDER \
--result-filename ${new_test_name}.json \
--request-rate $qps \
--metadata "tensor_parallel_size=$tp" \
$client_args"
echo "Running test case $test_name with qps $qps"
@ -345,6 +348,11 @@ main() {
check_gpus
check_hf_token
# Set to v1 to run v1 benchmark
if [[ "${ENGINE_VERSION:-v0}" == "v1" ]]; then
export VLLM_USE_V1=1
fi
# dependencies
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
(which jq) || (apt-get update && apt-get -y install jq)

View File

@ -1,6 +1,10 @@
#!/bin/sh
TOKEN=$(curl -s -L "https://public.ecr.aws/token?service=public.ecr.aws&scope=repository:q9t5s3a7/vllm-ci-postmerge-repo:pull" | jq -r .token)
URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-postmerge-repo/manifests/$BUILDKITE_COMMIT"
if [[ "$BUILDKITE_BRANCH" == "main" ]]; then
URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-postmerge-repo/manifests/$BUILDKITE_COMMIT"
else
URL="https://public.ecr.aws/v2/q9t5s3a7/vllm-ci-test-repo/manifests/$BUILDKITE_COMMIT"
fi
TIMEOUT_SECONDS=10

View File

@ -29,4 +29,4 @@
"num-iters": 15
}
}
]
]

View File

@ -66,8 +66,7 @@
"swap_space": 16,
"speculative_model": "turboderp/Qwama-0.5B-Instruct",
"num_speculative_tokens": 4,
"speculative_draft_tensor_parallel_size": 1,
"use_v2_block_manager": ""
"speculative_draft_tensor_parallel_size": 1
},
"client_parameters": {
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",

View File

@ -32,4 +32,4 @@
"backend": "vllm"
}
}
]
]

View File

@ -1,4 +1,15 @@
steps:
- label: "Build wheel - CUDA 12.4"
agents:
queue: cpu_queue_postmerge
commands:
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag vllm-ci:build-image --target build --progress plain ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/upload-wheels.sh"
env:
DOCKER_BUILDKIT: "1"
- label: "Build wheel - CUDA 12.1"
agents:
queue: cpu_queue_postmerge
@ -37,7 +48,7 @@ steps:
queue: cpu_queue_postmerge
commands:
- "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.1.0 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain ."
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.4.0 --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT --target vllm-openai --progress plain ."
- "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
- label: "Build and publish TPU release image"

View File

@ -77,7 +77,6 @@ echo "Commands:$commands"
#ignore certain kernels tests
if [[ $commands == *" kernels "* ]]; then
commands="${commands} \
--ignore=kernels/test_attention.py \
--ignore=kernels/test_attention_selector.py \
--ignore=kernels/test_blocksparse_attention.py \
--ignore=kernels/test_causal_conv1d.py \
@ -92,19 +91,40 @@ if [[ $commands == *" kernels "* ]]; then
--ignore=kernels/test_moe.py \
--ignore=kernels/test_prefix_prefill.py \
--ignore=kernels/test_rand.py \
--ignore=kernels/test_sampler.py"
--ignore=kernels/test_sampler.py \
--ignore=kernels/test_cascade_flash_attn.py \
--ignore=kernels/test_mamba_mixer2.py \
--ignore=kernels/test_aqlm.py \
--ignore=kernels/test_machete_mm.py \
--ignore=kernels/test_mha_attn.py \
--ignore=kernels/test_block_fp8.py \
--ignore=kernels/test_permute_cols.py"
fi
#ignore certain Entrypoints tests
#ignore certain Entrypoints/openai tests
if [[ $commands == *" entrypoints/openai "* ]]; then
commands=${commands//" entrypoints/openai "/" entrypoints/openai \
--ignore=entrypoints/openai/test_accuracy.py \
--ignore=entrypoints/openai/test_audio.py \
--ignore=entrypoints/openai/test_encoder_decoder.py \
--ignore=entrypoints/openai/test_embedding.py \
--ignore=entrypoints/openai/test_oot_registration.py "}
--ignore=entrypoints/openai/test_chat.py \
--ignore=entrypoints/openai/test_shutdown.py \
--ignore=entrypoints/openai/test_completion.py \
--ignore=entrypoints/openai/test_sleep.py \
--ignore=entrypoints/openai/test_models.py \
--ignore=entrypoints/openai/test_prompt_validation.py "}
fi
#ignore certain Entrypoints/llm tests
if [[ $commands == *" && pytest -v -s entrypoints/llm/test_guided_generate.py"* ]]; then
commands=${commands//" && pytest -v -s entrypoints/llm/test_guided_generate.py"/" "}
fi
# --ignore=entrypoints/openai/test_encoder_decoder.py \
# --ignore=entrypoints/openai/test_embedding.py \
# --ignore=entrypoints/openai/test_oot_registration.py
# --ignore=entrypoints/openai/test_accuracy.py \
# --ignore=entrypoints/openai/test_models.py <= Fails on MI250 but passes on MI300 as of 2025-03-13
PARALLEL_JOB_COUNT=8
# Check if the command contains the shard flag; if so, we run all shards in parallel because the host has 8 GPUs.
if [[ $commands == *"--shard-id="* ]]; then
@ -121,6 +141,8 @@ if [[ $commands == *"--shard-id="* ]]; then
--rm \
-e HIP_VISIBLE_DEVICES="${GPU}" \
-e HF_TOKEN \
-e AWS_ACCESS_KEY_ID \
-e AWS_SECRET_ACCESS_KEY \
-v "${HF_CACHE}:${HF_MOUNT}" \
-e "HF_HOME=${HF_MOUNT}" \
--name "${container_name}_${GPU}" \
@ -148,6 +170,8 @@ else
--rm \
-e HIP_VISIBLE_DEVICES=0 \
-e HF_TOKEN \
-e AWS_ACCESS_KEY_ID \
-e AWS_SECRET_ACCESS_KEY \
-v "${HF_CACHE}:${HF_MOUNT}" \
-e "HF_HOME=${HF_MOUNT}" \
--name "${container_name}" \

View File

@ -19,23 +19,25 @@ remove_docker_container
# Run the image, setting --shm-size=4g for tensor parallel.
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
--cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"
--cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus="$CORE_RANGE" \
--cpuset-mems="$NUMA_NODE" --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2
--cpuset-mems="$NUMA_NODE" --privileged=true -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2
function cpu_tests() {
set -e
export NUMA_NODE=$2
export BUILDKITE_BUILD_NUMBER=$3
# offline inference
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-avx2-"$NUMA_NODE" bash -c "
set -e
python3 examples/offline_inference/basic.py"
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m"
# Run basic model test
docker exec cpu-test-"$BUILDKITE_BUILD_NUMBER"-"$NUMA_NODE" bash -c "
set -e
pip install -r vllm/requirements-test.txt
pip install -r vllm/requirements/test.txt
pip install -r vllm/requirements/cpu.txt
pytest -v -s tests/models/decoder_only/language -m cpu_model
pytest -v -s tests/models/embedding/language -m cpu_model
pytest -v -s tests/models/encoder_decoder/language -m cpu_model
@ -85,4 +87,4 @@ function cpu_tests() {
# All of CPU tests are expected to be finished less than 40 mins.
export -f cpu_tests
timeout 40m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
timeout 40m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE $BUILDKITE_BUILD_NUMBER"

View File

@ -24,5 +24,5 @@ remove_docker_container
# Run the image and test offline inference
docker run -e HF_TOKEN -v /root/.cache/huggingface:/root/.cache/huggingface --name gh200-test --gpus=all --entrypoint="" gh200-test bash -c '
python3 examples/offline_inference/cli.py --model meta-llama/Llama-3.2-1B
python3 examples/offline_inference/basic/generate.py --model meta-llama/Llama-3.2-1B
'

View File

@ -20,5 +20,5 @@ trap remove_docker_container_and_exit EXIT
remove_docker_container
# Run the image and launch offline inference
docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/basic.py
docker run --runtime=habana --name=hpu-test --network=host -e HABANA_VISIBLE_DEVICES=all -e VLLM_SKIP_WARMUP=true --entrypoint="" hpu-test-env python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m
EXITCODE=$?

View File

@ -29,9 +29,6 @@ if [ -f /tmp/neuron-docker-build-timestamp ]; then
docker image prune -f
# Remove unused volumes / force the system prune for old images as well.
docker volume prune -f && docker system prune -f
# Remove huggingface model artifacts and compiler cache
rm -rf "${HF_MOUNT:?}/*"
rm -rf "${NEURON_COMPILE_CACHE_MOUNT:?}/*"
echo "$current_time" > /tmp/neuron-docker-build-timestamp
fi
else
@ -47,11 +44,11 @@ remove_docker_container() {
trap remove_docker_container EXIT
# Run the image
docker run --rm -it --device=/dev/neuron0 --device=/dev/neuron1 --network host \
docker run --rm -it --device=/dev/neuron0 --network bridge \
-v "${HF_CACHE}:${HF_MOUNT}" \
-e "HF_HOME=${HF_MOUNT}" \
-v "${NEURON_COMPILE_CACHE_URL}:${NEURON_COMPILE_CACHE_MOUNT}" \
-e "NEURON_COMPILE_CACHE_URL=${NEURON_COMPILE_CACHE_MOUNT}" \
--name "${container_name}" \
${image_name} \
/bin/bash -c "python3 /workspace/vllm/examples/offline_inference/neuron.py && python3 -m pytest /workspace/vllm/tests/neuron/ -v --capture=tee-sys"
/bin/bash -c "python3 /workspace/vllm/examples/offline_inference/neuron.py && python3 -m pytest /workspace/vllm/tests/neuron/1_core/ -v --capture=tee-sys && python3 -m pytest /workspace/vllm/tests/neuron/2_core/ -v --capture=tee-sys"

View File

@ -13,4 +13,4 @@ trap remove_docker_container EXIT
remove_docker_container
# Run the image and launch offline inference
docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference/basic.py
docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference/basic/generate.py --model facebook/opt-125m

View File

@ -19,7 +19,6 @@ docker run --privileged --net host --shm-size=16G -it \
vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \
&& python3 -m pip install pytest \
&& python3 -m pip install lm_eval[api]==0.4.4 \
&& pytest -v -s /workspace/vllm/tests/entrypoints/openai/test_accuracy.py \
&& pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
&& python3 /workspace/vllm/tests/tpu/test_compilation.py \
&& python3 /workspace/vllm/tests/tpu/test_quantization_accuracy.py \

27
.buildkite/run-tpu-v1-test.sh Executable file
View File

@ -0,0 +1,27 @@
#!/bin/bash
set -e
# Build the docker image.
docker build -f Dockerfile.tpu -t vllm-tpu .
# Set up cleanup.
remove_docker_container() { docker rm -f tpu-test || true; }
trap remove_docker_container EXIT
# Remove the container that might not be cleaned up in the previous run.
remove_docker_container
# For HF_TOKEN.
source /etc/environment
# Run a simple end-to-end example.
docker run --privileged --net host --shm-size=16G -it \
-e "HF_TOKEN=$HF_TOKEN" -e "VLLM_USE_V1=1" --name tpu-test \
vllm-tpu /bin/bash -c "python3 -m pip install git+https://github.com/thuml/depyf.git \
&& python3 -m pip install pytest \
&& python3 -m pip install lm_eval[api]==0.4.4 \
&& pytest -v -s /workspace/vllm/tests/tpu/test_custom_dispatcher.py \
&& pytest -v -s /workspace/vllm/tests/v1/tpu/test_basic.py \
&& pytest -v -s /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine \
&& python3 /workspace/vllm/tests/tpu/test_compilation.py \
&& python3 /workspace/vllm/tests/tpu/test_quantization_accuracy.py \
&& python3 /workspace/vllm/examples/offline_inference/tpu.py"

View File

@ -4,16 +4,27 @@
# It serves as a sanity check for compilation and basic model usage.
set -ex
image_name="xpu/vllm-ci:${BUILDKITE_COMMIT}"
container_name="xpu_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
# Try building the docker image
docker build -t xpu-test -f Dockerfile.xpu .
docker build -t ${image_name} -f Dockerfile.xpu .
# Setup cleanup
remove_docker_container() { docker rm -f xpu-test || true; }
remove_docker_container() {
docker rm -f "${container_name}" || docker image rm -f "${image_name}" || true;
}
trap remove_docker_container EXIT
remove_docker_container
# Run the image and test offline inference/tensor parallel
docker run --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path --entrypoint="" xpu-test sh -c '
python3 examples/offline_inference/basic.py
python3 examples/offline_inference/cli.py -tp 2
docker run \
--device /dev/dri \
-v /dev/dri/by-path:/dev/dri/by-path \
--entrypoint="" \
--name "${container_name}" \
"${image_name}" \
sh -c '
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m
python3 examples/offline_inference/basic/generate.py --model facebook/opt-125m -tp 2
'

View File

@ -2,7 +2,7 @@
# adding a new command to an existing step. See different options here for examples.
# This script will be fed into the Jinja template in `test-template-aws.j2` at
# https://github.com/vllm-project/buildkite-ci/blob/main/scripts/test-template-aws.j2
# https://github.com/vllm-project/buildkite-ci/blob/main/scripts/test-template-aws.j2
# to generate the final pipeline yaml file.
# Documentation
@ -15,7 +15,7 @@
# mirror_hardwares(list): the list of hardwares to run the test on as well. currently only supports [amd]
# gpu(str): override the GPU selection for the test. default is on L4 GPUs. currently only supports a100
# num_gpus(int): override the number of GPUs for the test. defaults to 1 GPU. currently supports 2,4.
# num_nodes(int): whether to simulate multi-node setup by launching multiple containers on one host,
# num_nodes(int): whether to simulate multi-node setup by launching multiple containers on one host,
# in this case, commands must be specified. the first command runs on first host, the second
# command runs on the second host.
# working_dir(str): specify the place where command should execute, default to /vllm-workspace/tests
@ -24,8 +24,8 @@
# When adding a test
# - If the test belongs to an existing group, add it there
# - If the test is short, add to any existing step
# - If the test takes more than 10min, then it is okay to create a new step.
# Note that all steps execute in parallel.
# - If the test takes more than 10min, then it is okay to create a new step.
# Note that all steps execute in parallel.
steps:
##### fast check tests #####
@ -35,13 +35,12 @@ steps:
fast_check: true
no_gpu: True
commands:
- pip install -r requirements-docs.txt
- pip install -r ../../requirements/docs.txt
- SPHINXOPTS=\"-W\" make html
# Check API reference (if it fails, you may have missing mock imports)
- grep \"sig sig-object py\" build/html/api/inference_params.html
- label: Async Engine, Inputs, Utils, Worker Test # 24min
fast_check: true
source_file_dependencies:
- vllm/
- tests/mq_llm_engine
@ -78,6 +77,7 @@ steps:
- tests/basic_correctness/test_preemption
- tests/basic_correctness/test_cumem.py
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s basic_correctness/test_cumem.py
- pytest -v -s basic_correctness/test_basic_correctness.py
- pytest -v -s basic_correctness/test_cpu_offload.py
@ -107,47 +107,55 @@ steps:
mirror_hardwares: [amd]
source_file_dependencies:
- vllm/
- tests/entrypoints/llm
- tests/entrypoints/openai
- tests/entrypoints/test_chat_utils
- tests/entrypoints/offline_mode
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
- pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
- pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
- pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py
- VLLM_USE_V1=0 pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
- pytest -v -s entrypoints/openai --ignore=entrypoints/openai/test_oot_registration.py --ignore=entrypoints/openai/correctness/
- pytest -v -s entrypoints/test_chat_utils.py
- pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
- VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
- label: Distributed Tests (4 GPUs) # 10min
working_dir: "/vllm-workspace/tests"
num_gpus: 4
fast_check: true
source_file_dependencies:
- vllm/distributed/
- vllm/core/
- tests/distributed
- tests/distributed/test_utils
- tests/distributed/test_pynccl
- tests/spec_decode/e2e/test_integration_dist_tp4
- tests/compile
- tests/compile/test_basic_correctness
- examples/offline_inference/rlhf.py
- examples/offline_inference/ray_placement.py
- examples/offline_inference/rlhf_colocate.py
- tests/examples/offline_inference/data_parallel.py
commands:
- python3 ../examples/offline_inference/data_parallel.py
- pytest -v -s distributed/test_utils.py
- pytest -v -s compile/test_basic_correctness.py
- pytest -v -s distributed/test_pynccl.py
- pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py
# TODO: create a dedicated test section for multi-GPU example tests
# when we have multiple distributed example tests
- python3 ../examples/offline_inference/rlhf.py
- RAY_DEDUP_LOGS=0 python3 ../examples/offline_inference/ray_placement.py
- pushd ../examples/offline_inference
- python3 rlhf.py
- RAY_DEDUP_LOGS=0 python3 rlhf_colocate.py
- popd
- label: Metrics, Tracing Test # 10min
num_gpus: 2
fast_check: true
num_gpus: 2
source_file_dependencies:
- vllm/
- tests/metrics
- tests/tracing
commands:
- pytest -v -s metrics
- pytest -v -s metrics
- "pip install \
'opentelemetry-sdk>=1.26.0,<1.27.0' \
'opentelemetry-api>=1.26.0,<1.27.0' \
@ -174,6 +182,9 @@ steps:
- vllm/
- tests/engine
- tests/tokenization
- tests/test_sequence
- tests/test_config
- tests/test_logger
commands:
- pytest -v -s engine test_sequence.py test_config.py test_logger.py
# OOM in the CI unless we run this separately
@ -186,15 +197,22 @@ steps:
- tests/v1
commands:
# split the test to avoid interference
- VLLM_USE_V1=1 pytest -v -s v1/core
- VLLM_USE_V1=1 pytest -v -s v1/engine
- VLLM_USE_V1=1 pytest -v -s v1/sample
- VLLM_USE_V1=1 pytest -v -s v1/worker
- VLLM_USE_V1=1 pytest -v -s v1/test_stats.py
- VLLM_USE_V1=1 pytest -v -s v1/test_utils.py
- pytest -v -s v1/core
- pytest -v -s v1/entrypoints
- pytest -v -s v1/engine
- pytest -v -s v1/entrypoints
- pytest -v -s v1/sample
- pytest -v -s v1/worker
- pytest -v -s v1/structured_output
- pytest -v -s v1/test_stats.py
- pytest -v -s v1/test_utils.py
- pytest -v -s v1/test_oracle.py
# TODO: accuracy does not match, whether setting
# VLLM_USE_FLASHINFER_SAMPLER or not on H100.
- VLLM_USE_V1=1 pytest -v -s v1/e2e
- pytest -v -s v1/e2e
# Integration test for streaming correctness (requires special branch).
- pip install -U git+https://github.com/robertgshaw2-neuralmagic/lm-evaluation-harness.git@streaming-api
- pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
- label: Examples Test # 25min
working_dir: "/vllm-workspace/examples"
@ -204,19 +222,22 @@ steps:
- examples/
commands:
- pip install tensorizer # for tensorizer test
- python3 offline_inference/basic.py
- python3 offline_inference/cpu_offload.py
- python3 offline_inference/chat.py
- python3 offline_inference/basic/generate.py --model facebook/opt-125m
- python3 offline_inference/basic/generate.py --model meta-llama/Llama-2-13b-chat-hf --cpu-offload-gb 10
- python3 offline_inference/basic/chat.py
- python3 offline_inference/prefix_caching.py
- python3 offline_inference/llm_engine_example.py
- python3 offline_inference/vision_language.py
- python3 offline_inference/vision_language_multi_image.py
- python3 other/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 other/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 offline_inference/audio_language.py --seed 0
- python3 offline_inference/vision_language.py --seed 0
- python3 offline_inference/vision_language_embedding.py --seed 0
- python3 offline_inference/vision_language_multi_image.py --seed 0
- VLLM_USE_V1=0 python3 other/tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 other/tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 offline_inference/encoder_decoder.py
- python3 offline_inference/classification.py
- python3 offline_inference/embedding.py
- python3 offline_inference/scoring.py
- python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
- python3 offline_inference/basic/classify.py
- python3 offline_inference/basic/embed.py
- python3 offline_inference/basic/score.py
- VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2
- label: Prefix Caching Test # 9min
mirror_hardwares: [amd]
@ -243,7 +264,7 @@ steps:
- vllm/model_executor/guided_decoding
- tests/test_logits_processor
- tests/model_executor/test_guided_processors
commands:
commands:
- pytest -v -s test_logits_processor.py
- pytest -v -s model_executor/test_guided_processors.py
@ -254,7 +275,7 @@ steps:
- vllm/model_executor/models/eagle.py
commands:
- pytest -v -s spec_decode/e2e/test_multistep_correctness.py
- VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py
- VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py --ignore=spec_decode/e2e/test_mtp_correctness.py
- pytest -v -s spec_decode/e2e/test_eagle_correctness.py
- label: LoRA Test %N # 15min each
@ -262,11 +283,10 @@ steps:
source_file_dependencies:
- vllm/lora
- tests/lora
command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_minicpmv_tp.py
command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py --ignore=lora/test_chatglm3_tp.py --ignore=lora/test_llama_tp.py --ignore=lora/test_minicpmv_tp.py --ignore=lora/test_transfomers_model.py
parallelism: 4
- label: "PyTorch Fullgraph Smoke Test" # 9min
fast_check: true
- label: PyTorch Fullgraph Smoke Test # 9min
source_file_dependencies:
- vllm/
- tests/compile
@ -276,7 +296,7 @@ steps:
- pytest -v -s compile/piecewise/test_simple.py
- pytest -v -s compile/piecewise/test_toy_llama.py
- label: "PyTorch Fullgraph Test" # 18min
- label: PyTorch Fullgraph Test # 18min
source_file_dependencies:
- vllm/
- tests/compile
@ -328,6 +348,14 @@ steps:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- bash ./run-tests.sh -c configs/models-small.txt -t 1
- label: OpenAI API correctness
source_file_dependencies:
- csrc/
- vllm/entrypoints/openai/
- vllm/model_executor/models/whisper.py
commands: # LMEval+Transcription WER check
- pytest -s entrypoints/openai/correctness/
- label: Encoder Decoder tests # 5min
source_file_dependencies:
- vllm/
@ -353,7 +381,8 @@ steps:
commands:
- pytest -v -s models/test_transformers.py
- pytest -v -s models/test_registry.py
- pytest -v -s models/test_initialization.py
# V1 Test: https://github.com/vllm-project/vllm/issues/14531
- VLLM_USE_V1=0 pytest -v -s models/test_initialization.py
- label: Language Models Test (Standard) # 32min
#mirror_hardwares: [amd]
@ -482,6 +511,7 @@ steps:
- entrypoints/llm/test_collective_rpc.py
commands:
- pytest -v -s entrypoints/llm/test_collective_rpc.py
- VLLM_USE_V1=1 torchrun --nproc-per-node=2 distributed/test_torchrun_example.py
- torchrun --nproc-per-node=2 distributed/test_torchrun_example.py
- pytest -v -s ./compile/test_basic_correctness.py
- pytest -v -s ./compile/test_wrapper.py
@ -495,13 +525,12 @@ steps:
# this test fails consistently.
# TODO: investigate and fix
# - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/disagg_test.py
- VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
- VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/test_disagg.py
- label: Plugin Tests (2 GPUs) # 40min
working_dir: "/vllm-workspace/tests"
num_gpus: 2
fast_check: true
source_file_dependencies:
- vllm/plugins/
- tests/plugins/
@ -512,6 +541,7 @@ steps:
- pip uninstall vllm_add_dummy_platform -y
# end platform plugin tests
# other tests continue here:
- pytest -v -s plugins_tests/test_scheduler_plugins.py
- pip install -e ./plugins/vllm_add_dummy_model
- pytest -v -s distributed/test_distributed_oot.py
- pytest -v -s entrypoints/openai/test_oot_registration.py # it needs a clean process
@ -561,11 +591,12 @@ steps:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
# This test runs llama 13B, so it is required to run on 4 GPUs.
- pytest -v -s -x lora/test_long_context.py
# There is some Tensor Parallelism related processing logic in LoRA that
# There is some Tensor Parallelism related processing logic in LoRA that
# requires multi-GPU testing for validation.
- pytest -v -s -x lora/test_chatglm3_tp.py
- pytest -v -s -x lora/test_llama_tp.py
- pytest -v -s -x lora/test_minicpmv_tp.py
- pytest -v -s -x lora/test_transfomers_model.py
- label: Weight Loading Multiple GPU Test # 33min
@ -586,7 +617,7 @@ steps:
- vllm/
- tests/weight_loading
commands:
- bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
- bash weight_loading/run_model_weight_loading_test.sh -c weight_loading/models-large.txt
##### multi gpus test #####
@ -598,7 +629,7 @@ steps:
num_gpus: 4
source_file_dependencies:
- vllm/
commands:
commands:
# NOTE: don't test llama model here, it seems hf implementation is buggy
# see https://github.com/vllm-project/vllm/pull/5689 for details
- pytest -v -s distributed/test_custom_all_reduce.py

View File

@ -50,8 +50,11 @@ aws s3 cp "$normal_wheel" "s3://vllm-wheels/$BUILDKITE_COMMIT/"
if [[ $normal_wheel == *"cu118"* ]]; then
# if $normal_wheel matches cu118, do not upload the index.html
echo "Skipping index files for cu118 wheels"
elif [[ $normal_wheel == *"cu121"* ]]; then
# if $normal_wheel matches cu121, do not upload the index.html
echo "Skipping index files for cu121 wheels"
else
# only upload index.html for cu12 wheels (default wheels)
# only upload index.html for cu124 wheels (default wheels)
aws s3 cp index.html "s3://vllm-wheels/$BUILDKITE_COMMIT/vllm/index.html"
aws s3 cp "s3://vllm-wheels/nightly/index.html" "s3://vllm-wheels/$BUILDKITE_COMMIT/index.html"
fi
@ -63,8 +66,11 @@ aws s3 cp "$normal_wheel" "s3://vllm-wheels/nightly/"
if [[ $normal_wheel == *"cu118"* ]]; then
# if $normal_wheel matches cu118, do not upload the index.html
echo "Skipping index files for cu118 wheels"
elif [[ $normal_wheel == *"cu121"* ]]; then
# if $normal_wheel matches cu121, do not upload the index.html
echo "Skipping index files for cu121 wheels"
else
# only upload index.html for cu12 wheels (default wheels)
# only upload index.html for cu124 wheels (default wheels)
aws s3 cp index.html "s3://vllm-wheels/nightly/vllm/index.html"
fi

27
.github/CODEOWNERS vendored
View File

@ -10,27 +10,32 @@
/vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth
/vllm/model_executor/guided_decoding @mgoin
/vllm/model_executor/guided_decoding @mgoin @russellb
/vllm/multimodal @DarkLight1337 @ywang96
CMakeLists.txt @tlrmchlsmth
# vLLM V1
/vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
/vllm/v1/structured_output @mgoin @russellb
# Test ownership
/tests/async_engine @njhill @robertgshaw2-redhat @simon-mo
/tests/test_inputs.py @DarkLight1337 @ywang96
/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo
/tests/models @DarkLight1337 @ywang96
/tests/multimodal @DarkLight1337 @ywang96
/tests/prefix_caching @comaniac @KuntaiDu
/tests/spec_decode @njhill @LiuXiaoxuanPKU
/tests/kernels @tlrmchlsmth @WoosukKwon
/tests/quantization @mgoin @robertgshaw2-redhat
/.buildkite/lm-eval-harness @mgoin @simon-mo
/tests/async_engine @njhill @robertgshaw2-redhat @simon-mo
/tests/basic_correctness/test_chunked_prefill @rkooo567 @comaniac
/tests/distributed/test_multi_node_assignment.py @youkaichao
/tests/distributed/test_pipeline_parallel.py @youkaichao
/tests/distributed/test_same_node.py @youkaichao
/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo
/tests/entrypoints/llm/test_guided_generate.py @mgoin @russellb
/tests/kernels @tlrmchlsmth @WoosukKwon
/tests/model_executor/test_guided_processors.py @mgoin @russellb
/tests/models @DarkLight1337 @ywang96
/tests/multi_step @alexm-redhat @comaniac
/tests/multimodal @DarkLight1337 @ywang96
/tests/prefix_caching @comaniac @KuntaiDu
/tests/quantization @mgoin @robertgshaw2-redhat
/tests/spec_decode @njhill @LiuXiaoxuanPKU
/tests/test_inputs.py @DarkLight1337 @ywang96
/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb
/tests/v1/structured_output @mgoin @russellb
/tests/weight_loading @mgoin @youkaichao
/tests/basic_correctness/test_chunked_prefill @rkooo567 @comaniac

View File

@ -2,4 +2,5 @@ FILL IN THE PR DESCRIPTION HERE
FIX #xxxx (*link existing issues this PR will resolve*)
**BEFORE SUBMITTING, PLEASE READ https://docs.vllm.ai/en/latest/contributing/overview.html **
<!--- pyml disable-next-line no-emphasis-as-heading -->
**BEFORE SUBMITTING, PLEASE READ <https://docs.vllm.ai/en/latest/contributing/overview.html>**

View File

@ -23,7 +23,7 @@ updates:
- dependency-name: "lm-format-enforcer"
- dependency-name: "gguf"
- dependency-name: "compressed-tensors"
- dependency-name: "ray[adag]"
- dependency-name: "ray[cgraph]" # Ray Compiled Graph
- dependency-name: "lm-eval"
groups:
minor-update:

16
.github/mergify.yml vendored
View File

@ -5,6 +5,7 @@ pull_request_rules:
- or:
- files~=^[^/]+\.md$
- files~=^docs/
- files~=^examples/
actions:
label:
add:
@ -35,6 +36,21 @@ pull_request_rules:
add:
- frontend
- name: label-multi-modality
description: Automatically apply multi-modality label
conditions:
- or:
- files~=^vllm/multimodal/
- files~=^tests/multimodal/
- files~=^tests/models/multimodal/
- files~=^tests/models/*/audio_language/
- files~=^tests/models/*/vision_language/
- files=tests/models/test_vision.py
actions:
label:
add:
- multi-modality
- name: label-structured-output
description: Automatically apply structured-output label
conditions:

View File

@ -16,7 +16,7 @@ jobs:
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Set up Python
uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
with:
python-version: '3.12'

View File

@ -12,17 +12,17 @@ jobs:
fetch-depth: 0
- name: Set up Helm
uses: azure/setup-helm@fe7b79cd5ee1e45176fcad797de68ecaf3ca4814 # v4.2.0
uses: azure/setup-helm@b9e51907a09c216f16ebe8536097933489208112 # v4.3.0
with:
version: v3.14.4
#Python is required because ct lint runs Yamale and yamllint which require Python.
- uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
- uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
with:
python-version: '3.13'
- name: Set up chart-testing
uses: helm/chart-testing-action@e6669bcd63d7cb57cb4380c33043eebe5d111992 # v2.6.1
uses: helm/chart-testing-action@0d28d3144d3a25ea2cc349d6e59901c4ff469b3b # v2.7.0
with:
version: v3.10.1
@ -47,7 +47,7 @@ jobs:
aws --endpoint-url http://127.0.0.1:9000/ s3 cp opt-125m/ s3://testbucket/opt-125m --recursive
- name: Create kind cluster
uses: helm/kind-action@0025e74a8c7512023d06dc019c617aa3cf561fde # v1.10.0
uses: helm/kind-action@a1b0e391336a6ee6713a0583f8c6240d70863de3 # v1.12.0
- name: Build the Docker image vllm cpu
run: docker buildx build -f Dockerfile.cpu -t vllm-cpu-env .

View File

@ -10,10 +10,11 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0
- uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0
with:
python-version: "3.12"
- run: echo "::add-matcher::.github/workflows/matchers/actionlint.json"
- run: echo "::add-matcher::.github/workflows/matchers/mypy.json"
- uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1
with:
extra_args: --all-files --hook-stage manual

View File

@ -39,7 +39,7 @@ jobs:
const script = require('.github/workflows/scripts/create_release.js')
await script(github, context, core)
# NOTE(simon): No longer build wheel using Github Actions. See buildkite's release workflow.
# NOTE(simon): No longer build wheel using GitHub Actions. See buildkite's release workflow.
# wheel:
# name: Build Wheel
# runs-on: ${{ matrix.os }}
@ -50,7 +50,7 @@ jobs:
# matrix:
# os: ['ubuntu-20.04']
# python-version: ['3.9', '3.10', '3.11', '3.12']
# pytorch-version: ['2.4.0'] # Must be the most recent version that meets requirements-cuda.txt.
# pytorch-version: ['2.4.0'] # Must be the most recent version that meets requirements/cuda.txt.
# cuda-version: ['11.8', '12.1']
# steps:

View File

@ -9,7 +9,7 @@ PATH=${cuda_home}/bin:$PATH
LD_LIBRARY_PATH=${cuda_home}/lib64:$LD_LIBRARY_PATH
# Install requirements
$python_executable -m pip install -r requirements-build.txt -r requirements-cuda.txt
$python_executable -m pip install -r requirements/build.txt -r requirements/cuda.txt
# Limit the number of parallel jobs to avoid OOM
export MAX_JOBS=1

View File

@ -1,4 +1,4 @@
// Uses Github's API to create the release and wait for result.
// Uses GitHub's API to create the release and wait for result.
// We use a JS script since github CLI doesn't provide a way to wait for the release's creation and returns immediately.
module.exports = async (github, context, core) => {

View File

@ -13,7 +13,7 @@ jobs:
actions: write
runs-on: ubuntu-latest
steps:
- uses: actions/stale@28ca1036281a5e5922ead5184a1bbf96e5fc984e # v9.0.0
- uses: actions/stale@5bef64f19d7facfb25b37b414482c7164d639639 # v9.1.0
with:
# Increasing this value ensures that changes to this workflow
# propagate to all issues and PRs in days rather than months

2
.gitignore vendored
View File

@ -197,7 +197,7 @@ _build/
hip_compat.h
# Benchmark dataset
benchmarks/*.json
benchmarks/**/*.json
# Linting
actionlint

View File

@ -1,6 +1,7 @@
default_stages:
- pre-commit # Run locally
- manual # Run in CI
exclude: 'vllm/third_party/.*'
repos:
- repo: https://github.com/google/yapf
rev: v0.43.0
@ -12,32 +13,39 @@ repos:
rev: v0.9.3
hooks:
- id: ruff
args: [--output-format, github]
args: [--output-format, github, --fix]
- repo: https://github.com/codespell-project/codespell
rev: v2.4.0
hooks:
- id: codespell
exclude: 'benchmarks/sonnet.txt|(build|tests/(lora/data|models/fixtures|prompts))/.*'
additional_dependencies: ['tomli']
args: ['--toml', 'pyproject.toml']
- repo: https://github.com/PyCQA/isort
rev: 5.13.2
rev: 0a0b7a830386ba6a31c2ec8316849ae4d1b8240d # 6.0.0
hooks:
- id: isort
- repo: https://github.com/pre-commit/mirrors-clang-format
rev: v19.1.7
hooks:
- id: clang-format
exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))'
exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))|vllm/third_party/.*'
types_or: [c++, cuda]
args: [--style=file, --verbose]
- repo: https://github.com/jackdewinter/pymarkdown
rev: v0.9.27
hooks:
- id: pymarkdown
files: docs/.*
args: [fix]
- repo: https://github.com/rhysd/actionlint
rev: v1.7.7
hooks:
- id: actionlint
- repo: https://github.com/astral-sh/uv-pre-commit
rev: 0.6.2
hooks:
- id: pip-compile
args: [requirements/test.in, -o, requirements/test.txt]
files: ^requirements/test\.(in|txt)$
- repo: local
hooks:
- id: mypy-local
@ -45,7 +53,7 @@ repos:
entry: tools/mypy.sh 0 "local"
language: python
types: [python]
additional_dependencies: &mypy_deps [mypy==1.11.1, types-setuptools, types-PyYAML, types-requests]
additional_dependencies: &mypy_deps [mypy==1.11.1, types-cachetools, types-setuptools, types-PyYAML, types-requests]
stages: [pre-commit] # Don't run in CI
- id: mypy-3.9 # TODO: Use https://github.com/pre-commit/mirrors-mypy when mypy setup is less awkward
name: Run mypy for Python 3.9
@ -102,9 +110,20 @@ repos:
entry: python tools/check_spdx_header.py
language: python
types: [python]
- id: check-filenames
name: Check for spaces in all filenames
entry: bash
args:
- -c
- 'git ls-files | grep " " && echo "Filenames should not contain spaces!" && exit 1 || exit 0'
language: system
always_run: true
pass_filenames: false
# Keep `suggestion` last
- id: suggestion
name: Suggestion
entry: bash -c 'echo "To bypass pre-commit hooks, add --no-verify to git commit."'
language: system
verbose: true
pass_filenames: false
# Insert new entries above the `suggestion` entry

View File

@ -18,4 +18,4 @@ formats: []
# Optionally declare the Python requirements required to build your docs
python:
install:
- requirements: docs/requirements-docs.txt
- requirements: requirements/docs.txt

239
CMakeLists.txt Executable file → Normal file
View File

@ -31,10 +31,10 @@ set(ignoreMe "${VLLM_PYTHON_PATH}")
set(PYTHON_SUPPORTED_VERSIONS "3.9" "3.10" "3.11" "3.12")
# Supported NVIDIA architectures.
set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0")
set(CUDA_SUPPORTED_ARCHS "7.0;7.2;7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0")
# Supported AMD GPU architectures.
set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101")
set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx942;gfx1030;gfx1100;gfx1101")
#
# Supported/expected torch versions for CUDA/ROCm.
@ -46,8 +46,8 @@ set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx11
# requirements.txt files and should be kept consistent. The ROCm torch
# versions are derived from Dockerfile.rocm
#
set(TORCH_SUPPORTED_VERSION_CUDA "2.5.1")
set(TORCH_SUPPORTED_VERSION_ROCM "2.5.1")
set(TORCH_SUPPORTED_VERSION_CUDA "2.6.0")
set(TORCH_SUPPORTED_VERSION_ROCM "2.6.0")
#
# Try to find python package with an executable that exactly matches
@ -174,6 +174,25 @@ include(FetchContent)
file(MAKE_DIRECTORY ${FETCHCONTENT_BASE_DIR}) # Ensure the directory exists
message(STATUS "FetchContent base directory: ${FETCHCONTENT_BASE_DIR}")
#
# Set rocm version dev int.
#
if(VLLM_GPU_LANG STREQUAL "HIP")
#
# Overriding the default -O set up by cmake, adding ggdb3 for the most verbose debug info
#
set(CMAKE_${VLLM_GPU_LANG}_FLAGS_DEBUG "${CMAKE_${VLLM_GPU_LANG}_FLAGS_DEBUG} -O0 -ggdb3")
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0 -ggdb3")
#
# Certain HIP functions are marked as [[nodiscard]], yet vllm ignores the result which generates
# a lot of warnings that always mask real issues. Suppressing until this is properly addressed.
#
set(CMAKE_${VLLM_GPU_LANG}_FLAGS "${CMAKE_${VLLM_GPU_LANG}_FLAGS} -Wno-unused-result")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-result")
endif()
#
# Define other extension targets
#
@ -192,7 +211,7 @@ set_gencode_flags_for_srcs(
if(VLLM_GPU_LANG STREQUAL "CUDA")
message(STATUS "Enabling cumem allocator extension.")
# link against cuda driver library
list(APPEND CUMEM_LIBS cuda)
list(APPEND CUMEM_LIBS CUDA::cuda_driver)
define_gpu_extension_target(
cumem_allocator
DESTINATION vllm
@ -228,7 +247,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
# Set CUTLASS_REVISION manually -- its revision detection doesn't work in this case.
set(CUTLASS_REVISION "v3.6.0" CACHE STRING "CUTLASS revision to use")
# Please keep this in sync with FetchContent_Declare line below.
set(CUTLASS_REVISION "v3.8.0" CACHE STRING "CUTLASS revision to use")
# Use the specified CUTLASS source directory for compilation if VLLM_CUTLASS_SRC_DIR is provided
if (DEFINED ENV{VLLM_CUTLASS_SRC_DIR})
@ -245,7 +265,8 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
FetchContent_Declare(
cutlass
GIT_REPOSITORY https://github.com/nvidia/cutlass.git
GIT_TAG v3.7.0
# Please keep this in sync with CUTLASS_REVISION line above.
GIT_TAG v3.8.0
GIT_PROGRESS TRUE
# Speed up CUTLASS download by retrieving only the specified GIT_TAG instead of the history.
@ -264,8 +285,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
"csrc/custom_all_reduce.cu"
"csrc/permute_cols.cu"
"csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
"csrc/quantization/fp4/nvfp4_quant_entry.cu"
"csrc/quantization/fp4/nvfp4_scaled_mm_entry.cu"
"csrc/sparse/cutlass/sparse_scaled_mm_entry.cu"
"csrc/sparse/cutlass/sparse_compressor_entry.cu"
"csrc/cutlass_extensions/common.cpp")
set_gencode_flags_for_srcs(
@ -275,7 +297,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# Only build Marlin kernels if we are building for at least some compatible archs.
# Keep building Marlin for 9.0 as there are some group sizes and shapes that
# are not supported by Machete yet.
cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.7;8.9;9.0" "${CUDA_ARCHS}")
cuda_archs_loose_intersection(MARLIN_ARCHS "8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0" "${CUDA_ARCHS}")
if (MARLIN_ARCHS)
set(MARLIN_SRCS
"csrc/quantization/fp8/fp8_marlin.cu"
@ -295,43 +317,87 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
" in CUDA target architectures")
endif()
# Only build AllSpark kernels if we are building for at least some compatible archs.
cuda_archs_loose_intersection(ALLSPARK_ARCHS "8.0;8.6;8.7;8.9" "${CUDA_ARCHS}")
if (ALLSPARK_ARCHS)
set(ALLSPARK_SRCS
"csrc/quantization/gptq_allspark/allspark_repack.cu"
"csrc/quantization/gptq_allspark/allspark_qgemm_w8a16.cu")
set_gencode_flags_for_srcs(
SRCS "${ALLSPARK_SRCS}"
CUDA_ARCHS "${ALLSPARK_ARCHS}")
list(APPEND VLLM_EXT_SRC "${ALLSPARK_SRCS}")
message(STATUS "Building AllSpark kernels for archs: ${ALLSPARK_ARCHS}")
else()
message(STATUS "Not building AllSpark kernels as no compatible archs found"
" in CUDA target architectures")
endif()
set(SCALED_MM_3X_ARCHS)
# The cutlass_scaled_mm kernels for Hopper (c3x, i.e. CUTLASS 3.x) require
# CUDA 12.0 or later (and only work on Hopper, 9.0a for now).
cuda_archs_loose_intersection(SCALED_MM_3X_ARCHS "9.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS)
set(SRCS
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu"
# CUDA 12.0 or later
cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_ARCHS)
set(SRCS
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm90.cu"
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_fp8.cu"
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm90_int8.cu"
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_azp_sm90_int8.cu"
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm90_fp8.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
CUDA_ARCHS "${SCALED_MM_3X_ARCHS}")
CUDA_ARCHS "${SCALED_MM_ARCHS}")
list(APPEND VLLM_EXT_SRC "${SRCS}")
list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_C3X=1")
message(STATUS "Building scaled_mm_c3x for archs: ${SCALED_MM_3X_ARCHS}")
list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_SM90=1")
# Let scaled_mm_c2x know it doesn't need to build these arches
list(APPEND SCALED_MM_3X_ARCHS "${SCALED_MM_ARCHS}")
message(STATUS "Building scaled_mm_c3x_sm90 for archs: ${SCALED_MM_ARCHS}")
else()
if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_3X_ARCHS)
message(STATUS "Not building scaled_mm_c3x as CUDA Compiler version is "
if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0 AND SCALED_MM_ARCHS)
message(STATUS "Not building scaled_mm_c3x_sm90 as CUDA Compiler version is "
"not >= 12.0, we recommend upgrading to CUDA 12.0 or "
"later if you intend on running FP8 quantized models on "
"Hopper.")
else()
message(STATUS "Not building scaled_mm_c3x as no compatible archs found "
message(STATUS "Not building scaled_mm_c3x_sm90 as no compatible archs found "
"in CUDA target architectures")
endif()
endif()
# clear SCALED_MM_3X_ARCHS so the scaled_mm_c2x kernels know we didn't
# build any 3x kernels
set(SCALED_MM_3X_ARCHS)
# The cutlass_scaled_mm kernels for Blackwell (c3x, i.e. CUTLASS 3.x) require
# CUDA 12.8 or later
cuda_archs_loose_intersection(SCALED_MM_ARCHS "10.0a;10.1a;12.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND SCALED_MM_ARCHS)
set(SRCS
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm100.cu"
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm100_fp8.cu"
)
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
CUDA_ARCHS "${SCALED_MM_ARCHS}")
list(APPEND VLLM_EXT_SRC "${SRCS}")
list(APPEND VLLM_GPU_FLAGS "-DENABLE_SCALED_MM_SM100=1")
# Let scaled_mm_c2x know it doesn't need to build these arches
list(APPEND SCALED_MM_3X_ARCHS "${SCALED_MM_ARCHS}")
message(STATUS "Building scaled_mm_c3x_sm100 for archs: ${SCALED_MM_ARCHS}")
else()
if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND SCALED_MM_ARCHS)
message(STATUS "Not building scaled_mm_c3x_sm100 as CUDA Compiler version is "
"not >= 12.8, we recommend upgrading to CUDA 12.8 or "
"later if you intend on running FP8 quantized models on "
"Blackwell.")
else()
message(STATUS "Not building scaled_mm_c3x_100 as no compatible archs found "
"in CUDA target architectures")
endif()
endif()
#
# For the cutlass_scaled_mm kernels we want to build the c2x (CUTLASS 2.x)
# kernels for the remaining archs that are not already built for 3x.
cuda_archs_loose_intersection(SCALED_MM_2X_ARCHS
"7.5;8.0;8.6;8.7;8.9;9.0" "${CUDA_ARCHS}")
"7.5;8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0" "${CUDA_ARCHS}")
# subtract out the archs that are already built for 3x
list(REMOVE_ITEM SCALED_MM_2X_ARCHS ${SCALED_MM_3X_ARCHS})
if (SCALED_MM_2X_ARCHS)
@ -356,18 +422,18 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
# 2:4 Sparse Kernels
# The 2:4 sparse kernels cutlass_scaled_sparse_mm and cutlass_compressor
# require CUDA 12.2 or later (and only work on Hopper, 9.0a for now).
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_3X_ARCHS)
set(SRCS "csrc/sparse/cutlass/sparse_compressor_c3x.cu"
"csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu")
# require CUDA 12.2 or later (and only work on Hopper).
cuda_archs_loose_intersection(SCALED_MM_ARCHS "9.0a;" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_ARCHS)
set(SRCS "csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
CUDA_ARCHS "${SCALED_MM_3X_ARCHS}")
CUDA_ARCHS "${SCALED_MM_ARCHS}")
list(APPEND VLLM_EXT_SRC "${SRCS}")
list(APPEND VLLM_GPU_FLAGS "-DENABLE_SPARSE_SCALED_MM_C3X=1")
message(STATUS "Building sparse_scaled_mm_c3x for archs: ${SCALED_MM_3X_ARCHS}")
message(STATUS "Building sparse_scaled_mm_c3x for archs: ${SCALED_MM_ARCHS}")
else()
if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_3X_ARCHS)
if (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.2 AND SCALED_MM_ARCHS)
message(STATUS "Not building sparse_scaled_mm_c3x kernels as CUDA Compiler version is "
"not >= 12.2, we recommend upgrading to CUDA 12.2 or later "
"if you intend on running FP8 sparse quantized models on Hopper.")
@ -377,6 +443,23 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
endif()
endif()
# FP4 Archs and flags
cuda_archs_loose_intersection(FP4_ARCHS "10.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND FP4_ARCHS)
set(SRCS
"csrc/quantization/fp4/nvfp4_quant_kernels.cu"
"csrc/quantization/fp4/nvfp4_scaled_mm_kernels.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
CUDA_ARCHS "${FP4_ARCHS}")
list(APPEND VLLM_EXT_SRC "${SRCS}")
list(APPEND VLLM_GPU_FLAGS "-DENABLE_NVFP4=1")
message(STATUS "Building NVFP4 for archs: ${FP4_ARCHS}")
else()
message(STATUS "Not building NVFP4 as no compatible archs were found.")
# clear FP4_ARCHS
set(FP4_ARCHS)
endif()
#
# Machete kernels
@ -458,7 +541,8 @@ define_gpu_extension_target(
SOURCES ${VLLM_EXT_SRC}
COMPILE_FLAGS ${VLLM_GPU_FLAGS}
ARCHITECTURES ${VLLM_GPU_ARCHES}
INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR};${CUTLASS_TOOLS_UTIL_INCLUDE_DIR}
INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR}
INCLUDE_DIRECTORIES ${CUTLASS_TOOLS_UTIL_INCLUDE_DIR}
USE_SABI 3
WITH_SOABI)
@ -477,12 +561,24 @@ set(VLLM_MOE_EXT_SRC
"csrc/moe/moe_align_sum_kernels.cu"
"csrc/moe/topk_softmax_kernels.cu")
if(VLLM_GPU_LANG STREQUAL "CUDA")
list(APPEND VLLM_MOE_EXT_SRC "csrc/moe/moe_wna16.cu")
endif()
set_gencode_flags_for_srcs(
SRCS "${VLLM_MOE_EXT_SRC}"
CUDA_ARCHS "${CUDA_ARCHS}")
if(VLLM_GPU_LANG STREQUAL "CUDA")
cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.6;8.7;8.9;9.0" "${CUDA_ARCHS}")
set(VLLM_MOE_WNA16_SRC
"csrc/moe/moe_wna16.cu")
set_gencode_flags_for_srcs(
SRCS "${VLLM_MOE_WNA16_SRC}"
CUDA_ARCHS "${CUDA_ARCHS}")
list(APPEND VLLM_MOE_EXT_SRC "${VLLM_MOE_WNA16_SRC}")
cuda_archs_loose_intersection(MARLIN_MOE_ARCHS "8.0;8.6;8.7;8.9;9.0;10.0;10.1;12.0" "${CUDA_ARCHS}")
if (MARLIN_MOE_ARCHS)
set(MARLIN_MOE_SRC
"csrc/moe/marlin_kernels/marlin_moe_kernel.h"
@ -536,77 +632,8 @@ if(VLLM_GPU_LANG STREQUAL "HIP")
WITH_SOABI)
endif()
# vllm-flash-attn currently only supported on CUDA
if (NOT VLLM_GPU_LANG STREQUAL "CUDA")
return()
# For CUDA we also build and ship some external projects.
if (VLLM_GPU_LANG STREQUAL "CUDA")
include(cmake/external_projects/flashmla.cmake)
include(cmake/external_projects/vllm_flash_attn.cmake)
endif ()
# vLLM flash attention requires VLLM_GPU_ARCHES to contain the set of target
# arches in the CMake syntax (75-real, 89-virtual, etc), since we clear the
# arches in the CUDA case (and instead set the gencodes on a per file basis)
# we need to manually set VLLM_GPU_ARCHES here.
if(VLLM_GPU_LANG STREQUAL "CUDA")
foreach(_ARCH ${CUDA_ARCHS})
string(REPLACE "." "" _ARCH "${_ARCH}")
list(APPEND VLLM_GPU_ARCHES "${_ARCH}-real")
endforeach()
endif()
#
# Build vLLM flash attention from source
#
# IMPORTANT: This has to be the last thing we do, because vllm-flash-attn uses the same macros/functions as vLLM.
# Because functions all belong to the global scope, vllm-flash-attn's functions overwrite vLLM's.
# They should be identical but if they aren't, this is a massive footgun.
#
# The vllm-flash-attn install rules are nested under vllm to make sure the library gets installed in the correct place.
# To only install vllm-flash-attn, use --component _vllm_fa2_C (for FA2) or --component _vllm_fa3_C (for FA3).
# If no component is specified, vllm-flash-attn is still installed.
# If VLLM_FLASH_ATTN_SRC_DIR is set, vllm-flash-attn is installed from that directory instead of downloading.
# This is to enable local development of vllm-flash-attn within vLLM.
# It can be set as an environment variable or passed as a cmake argument.
# The environment variable takes precedence.
if (DEFINED ENV{VLLM_FLASH_ATTN_SRC_DIR})
set(VLLM_FLASH_ATTN_SRC_DIR $ENV{VLLM_FLASH_ATTN_SRC_DIR})
endif()
if(VLLM_FLASH_ATTN_SRC_DIR)
FetchContent_Declare(
vllm-flash-attn SOURCE_DIR
${VLLM_FLASH_ATTN_SRC_DIR}
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
)
else()
FetchContent_Declare(
vllm-flash-attn
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
GIT_TAG d4e09037abf588af1ec47d0e966b237ee376876c
GIT_PROGRESS TRUE
# Don't share the vllm-flash-attn build between build types
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
)
endif()
# Fetch the vllm-flash-attn library
FetchContent_MakeAvailable(vllm-flash-attn)
message(STATUS "vllm-flash-attn is available at ${vllm-flash-attn_SOURCE_DIR}")
# Copy over the vllm-flash-attn python files (duplicated for fa2 and fa3, in
# case only one is built, in the case both are built redundant work is done)
install(
DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/
DESTINATION vllm_flash_attn
COMPONENT _vllm_fa2_C
FILES_MATCHING PATTERN "*.py"
)
install(
DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/
DESTINATION vllm_flash_attn
COMPONENT _vllm_fa3_C
FILES_MATCHING PATTERN "*.py"
)
# Nothing after vllm-flash-attn, see comment about macros above
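
To make the install-component and source-directory options described in the comments above concrete, here is a hypothetical local-development invocation; the checkout path and build directory are assumptions, not project tooling:

```bash
# Assumed paths: build against a local vllm-flash-attn checkout instead of downloading,
# then install only one flash-attention component from an already-configured build tree.
export VLLM_FLASH_ATTN_SRC_DIR=$HOME/src/flash-attention
cmake --install /path/to/build --component _vllm_fa2_C   # FA2 only
cmake --install /path/to/build --component _vllm_fa3_C   # or FA3 only
```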

View File

@ -125,4 +125,3 @@ Community Impact Guidelines were inspired by
For answers to common questions about this code of conduct, see the
[Contributor Covenant FAQ](https://www.contributor-covenant.org/faq). Translations are available at
[Contributor Covenant translations](https://www.contributor-covenant.org/translations).

View File

@ -14,19 +14,21 @@ ARG PYTHON_VERSION=3.12
ARG TARGETPLATFORM
ENV DEBIAN_FRONTEND=noninteractive
# Install Python and other dependencies
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
&& apt-get update -y \
&& apt-get install -y ccache software-properties-common git curl sudo \
&& add-apt-repository ppa:deadsnakes/ppa \
&& apt-get update -y \
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
&& update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
&& update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
&& ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
&& curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
&& python3 --version && python3 -m pip --version
# Install minimal dependencies and uv
RUN apt-get update -y \
&& apt-get install -y ccache git curl wget sudo \
&& curl -LsSf https://astral.sh/uv/install.sh | sh
# Add uv to PATH
ENV PATH="/root/.local/bin:$PATH"
# Create venv with specified Python and activate by placing at the front of path
ENV VIRTUAL_ENV="/opt/venv"
RUN uv venv --python ${PYTHON_VERSION} --seed ${VIRTUAL_ENV}
ENV PATH="$VIRTUAL_ENV/bin:$PATH"
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694
ENV UV_HTTP_TIMEOUT=500
# Upgrade to GCC 10 to avoid https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92519
# as it was causing spam when compiling the CUTLASS kernels
@ -44,21 +46,19 @@ RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
WORKDIR /workspace
# install build and runtime dependencies
# arm64 (GH200) build follows the practice of "use existing pytorch" build,
# we need to install torch and torchvision from the nightly builds first,
# pytorch will not appear as a vLLM dependency in all of the following steps
# after this step
RUN --mount=type=cache,target=/root/.cache/pip \
RUN --mount=type=cache,target=/root/.cache/uv \
if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
python3 -m pip install --index-url https://download.pytorch.org/whl/nightly/cu126 "torch==2.7.0.dev20250121+cu126" "torchvision==0.22.0.dev20250121"; \
uv pip install --index-url https://download.pytorch.org/whl/nightly/cu126 "torch==2.7.0.dev20250121+cu126" "torchvision==0.22.0.dev20250121"; \
fi
COPY requirements-common.txt requirements-common.txt
COPY requirements-cuda.txt requirements-cuda.txt
RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install -r requirements-cuda.txt
COPY requirements/common.txt requirements/common.txt
COPY requirements/cuda.txt requirements/cuda.txt
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install -r requirements/cuda.txt
# cuda arch list used by torch
# can be useful for both `dev` and `test`
@ -76,15 +76,19 @@ FROM base AS build
ARG TARGETPLATFORM
# install build dependencies
COPY requirements-build.txt requirements-build.txt
COPY requirements/build.txt requirements/build.txt
RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install -r requirements-build.txt
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694
ENV UV_HTTP_TIMEOUT=500
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install -r requirements/build.txt
COPY . .
ARG GIT_REPO_CHECK=0
RUN --mount=type=bind,source=.git,target=.git \
if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
if [ "$GIT_REPO_CHECK" != "0" ]; then bash tools/check_repo.sh ; fi
# max jobs used by Ninja to build extensions
ARG max_jobs=2
@ -98,7 +102,7 @@ ARG SCCACHE_BUCKET_NAME=vllm-build-sccache
ARG SCCACHE_REGION_NAME=us-west-2
ARG SCCACHE_S3_NO_CREDENTIALS=0
# if USE_SCCACHE is set, use sccache to speed up compilation
RUN --mount=type=cache,target=/root/.cache/pip \
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,source=.git,target=.git \
if [ "$USE_SCCACHE" = "1" ]; then \
echo "Installing sccache..." \
@ -118,9 +122,12 @@ RUN --mount=type=cache,target=/root/.cache/pip \
ENV CCACHE_DIR=/root/.cache/ccache
RUN --mount=type=cache,target=/root/.cache/ccache \
--mount=type=cache,target=/root/.cache/pip \
--mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,source=.git,target=.git \
if [ "$USE_SCCACHE" != "1" ]; then \
# Clean any existing CMake artifacts
rm -rf .deps && \
mkdir -p .deps && \
python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \
fi
@ -140,11 +147,15 @@ RUN if [ "$RUN_WHEEL_CHECK" = "true" ]; then \
#################### DEV IMAGE ####################
FROM base as dev
COPY requirements-lint.txt requirements-lint.txt
COPY requirements-test.txt requirements-test.txt
COPY requirements-dev.txt requirements-dev.txt
RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install -r requirements-dev.txt
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694
ENV UV_HTTP_TIMEOUT=500
COPY requirements/lint.txt requirements/lint.txt
COPY requirements/test.txt requirements/test.txt
COPY requirements/dev.txt requirements/dev.txt
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install -r requirements/dev.txt
#################### DEV IMAGE ####################
#################### vLLM installation IMAGE ####################
@ -160,20 +171,22 @@ ARG TARGETPLATFORM
RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment
# Install Python and other dependencies
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
&& apt-get update -y \
&& apt-get install -y ccache software-properties-common git curl wget sudo vim python3-pip \
&& apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
&& add-apt-repository ppa:deadsnakes/ppa \
&& apt-get update -y \
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv libibverbs-dev \
&& update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
&& update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
&& ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
&& curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \
&& python3 --version && python3 -m pip --version
# Install minimal dependencies and uv
RUN apt-get update -y \
&& apt-get install -y ccache git curl wget sudo vim \
&& apt-get install -y ffmpeg libsm6 libxext6 libgl1 libibverbs-dev \
&& curl -LsSf https://astral.sh/uv/install.sh | sh
# Add uv to PATH
ENV PATH="/root/.local/bin:$PATH"
# Create venv with specified Python and activate by placing at the front of path
ENV VIRTUAL_ENV="/opt/venv"
RUN uv venv --python ${PYTHON_VERSION} --seed ${VIRTUAL_ENV}
ENV PATH="$VIRTUAL_ENV/bin:$PATH"
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694
ENV UV_HTTP_TIMEOUT=500
# Workaround for https://github.com/openai/triton/issues/2507 and
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
@ -185,29 +198,31 @@ RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
# we need to install torch and torchvision from the nightly builds first,
# pytorch will not appear as a vLLM dependency in all of the following steps
# after this step
RUN --mount=type=cache,target=/root/.cache/pip \
RUN --mount=type=cache,target=/root/.cache/uv \
if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
python3 -m pip install --index-url https://download.pytorch.org/whl/nightly/cu124 "torch==2.6.0.dev20241210+cu124" "torchvision==0.22.0.dev20241215"; \
uv pip install --index-url https://download.pytorch.org/whl/nightly/cu124 "torch==2.6.0.dev20241210+cu124" "torchvision==0.22.0.dev20241215"; \
fi
# Install vllm wheel first, so that torch etc will be installed.
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
--mount=type=cache,target=/root/.cache/pip \
python3 -m pip install dist/*.whl --verbose
--mount=type=cache,target=/root/.cache/uv \
uv pip install dist/*.whl --verbose
# How to build this FlashInfer wheel:
# If we need to build FlashInfer wheel before its release:
# $ export FLASHINFER_ENABLE_AOT=1
# $ # Note we remove 7.0 from the arch list compared to the list below, since FlashInfer only supports sm75+
# $ export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.6 8.9 9.0+PTX'
# $ git clone https://github.com/flashinfer-ai/flashinfer.git --recursive
# $ cd flashinfer
# $ git checkout 524304395bd1d8cd7d07db083859523fcaa246a4
# $ rm -rf build
# $ python3 setup.py bdist_wheel --dist-dir=dist --verbose
# $ ls dist
# $ # upload the wheel to a public location, e.g. https://wheels.vllm.ai/flashinfer/524304395bd1d8cd7d07db083859523fcaa246a4/flashinfer_python-0.2.1.post1+cu124torch2.5-cp38-abi3-linux_x86_64.whl
RUN --mount=type=cache,target=/root/.cache/pip \
. /etc/environment && \
RUN --mount=type=cache,target=/root/.cache/uv \
if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \
python3 -m pip install https://wheels.vllm.ai/flashinfer/524304395bd1d8cd7d07db083859523fcaa246a4/flashinfer_python-0.2.0.post1-cp${PYTHON_VERSION_STR}-cp${PYTHON_VERSION_STR}-linux_x86_64.whl; \
uv pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.1.post2/flashinfer_python-0.2.1.post2+cu124torch2.6-cp38-abi3-linux_x86_64.whl ; \
fi
COPY examples examples
@ -215,9 +230,9 @@ COPY examples examples
# some issues w.r.t. JIT compilation. Therefore we need to
# install build dependencies for JIT compilation.
# TODO: Remove this once FlashInfer AOT wheel is fixed
COPY requirements-build.txt requirements-build.txt
RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install -r requirements-build.txt
COPY requirements/build.txt requirements/build.txt
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install -r requirements/build.txt
#################### vLLM installation IMAGE ####################
@ -228,17 +243,21 @@ FROM vllm-base AS test
ADD . /vllm-workspace/
# install development dependencies (for testing)
RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install -r requirements-dev.txt
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694
ENV UV_HTTP_TIMEOUT=500
# install development dependencies (for testing)
RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install -e tests/vllm_test_utils
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install -r requirements/dev.txt
# install development dependencies (for testing)
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install -e tests/vllm_test_utils
# enable fast downloads from hf (for testing)
RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install hf_transfer
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install hf_transfer
ENV HF_HUB_ENABLE_HF_TRANSFER 1
# Copy in the v1 package for testing (it isn't distributed yet)
@ -256,12 +275,16 @@ RUN mv vllm test_docs/
# base openai image with additional requirements, for any subsequent openai-style images
FROM vllm-base AS vllm-openai-base
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694
ENV UV_HTTP_TIMEOUT=500
# install additional dependencies for openai api server
RUN --mount=type=cache,target=/root/.cache/pip \
RUN --mount=type=cache,target=/root/.cache/uv \
if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.42.0' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \
uv pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.42.0' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \
else \
pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.45.0' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \
uv pip install accelerate hf_transfer 'modelscope!=1.15.0' 'bitsandbytes>=0.45.0' 'timm==0.9.10' boto3 runai-model-streamer runai-model-streamer[s3]; \
fi
ENV VLLM_USAGE_SOURCE production-docker-image

View File

@ -26,18 +26,18 @@ WORKDIR /workspace
ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,src=requirements-build.txt,target=requirements-build.txt \
--mount=type=bind,src=requirements/build.txt,target=requirements/build.txt \
pip install --upgrade pip && \
pip install -r requirements-build.txt
pip install -r requirements/build.txt
FROM cpu-test-arm AS build
WORKDIR /workspace/vllm
RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,src=requirements-common.txt,target=requirements-common.txt \
--mount=type=bind,src=requirements-cpu.txt,target=requirements-cpu.txt \
pip install -v -r requirements-cpu.txt
--mount=type=bind,src=requirements/common.txt,target=requirements/common.txt \
--mount=type=bind,src=requirements/cpu.txt,target=requirements/cpu.txt \
pip install -v -r requirements/cpu.txt
COPY . .
ARG GIT_REPO_CHECK=0

View File

@ -22,25 +22,25 @@ ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/li
RUN echo 'ulimit -c 0' >> ~/.bashrc
RUN pip install intel_extension_for_pytorch==2.5.0
RUN pip install intel_extension_for_pytorch==2.6.0
WORKDIR /workspace
ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,src=requirements-build.txt,target=requirements-build.txt \
--mount=type=bind,src=requirements/build.txt,target=requirements/build.txt \
pip install --upgrade pip && \
pip install -r requirements-build.txt
pip install -r requirements/build.txt
FROM cpu-test-1 AS build
WORKDIR /workspace/vllm
RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,src=requirements-common.txt,target=requirements-common.txt \
--mount=type=bind,src=requirements-cpu.txt,target=requirements-cpu.txt \
pip install -v -r requirements-cpu.txt
--mount=type=bind,src=requirements/common.txt,target=requirements/common.txt \
--mount=type=bind,src=requirements/cpu.txt,target=requirements/cpu.txt \
pip install -v -r requirements/cpu.txt
COPY . .
ARG GIT_REPO_CHECK=0

View File

@ -4,7 +4,7 @@ COPY ./ /workspace/vllm
WORKDIR /workspace/vllm
RUN pip install -v -r requirements-hpu.txt
RUN pip install -v -r requirements/hpu.txt
ENV no_proxy=localhost,127.0.0.1
ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true

View File

@ -23,10 +23,12 @@ WORKDIR ${APP_MOUNT}/vllm
RUN python3 -m pip install --upgrade pip
RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas
RUN python3 -m pip install sentencepiece transformers==4.45.2 -U
RUN python3 -m pip install transformers-neuronx --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
RUN python3 -m pip install neuronx-cc==2.16.345.0 --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
RUN python3 -m pip install pytest
# uninstall transformers-neuronx package explicitly to avoid version conflict
RUN python3 -m pip uninstall -y transformers-neuronx
COPY . .
ARG GIT_REPO_CHECK=0
RUN --mount=type=bind,source=.git,target=.git \
@ -34,7 +36,7 @@ RUN --mount=type=bind,source=.git,target=.git \
RUN python3 -m pip install -U \
'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \
-r requirements-neuron.txt
-r requirements/neuron.txt
ENV VLLM_TARGET_DEVICE neuron
RUN --mount=type=bind,source=.git,target=.git \
@ -43,6 +45,10 @@ RUN --mount=type=bind,source=.git,target=.git \
# install development dependencies (for testing)
RUN python3 -m pip install -e tests/vllm_test_utils
# install transformers-neuronx package as an optional dependencies (for V0)
# FIXME: `--no-deps` argument is temporarily added to resolve transformers package version conflict
RUN python3 -m pip install transformers-neuronx==0.13.* --extra-index-url=https://pip.repos.neuron.amazonaws.com -U --no-deps
# overwrite entrypoint to run bash script
RUN echo "import subprocess; import sys; subprocess.check_call(sys.argv[1:])" > /usr/local/bin/dockerd-entrypoint.py

View File

@ -16,7 +16,7 @@ RUN --mount=type=bind,source=.git,target=.git \
RUN python3 -m pip install -U pip
# install build requirements
RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/requirements-build.txt
RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/requirements/build.txt
# build vLLM with OpenVINO backend
RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE="openvino" python3 -m pip install /workspace

View File

@ -6,7 +6,7 @@ ENV PATH="/usr/local/cargo/bin:$PATH:/opt/conda/bin/"
RUN apt-get update -y && apt-get install -y git wget kmod curl vim libnuma-dev libsndfile-dev libprotobuf-dev build-essential ffmpeg libsm6 libxext6 libgl1 libssl-dev
# Some packages in requirements-cpu are installed here
# Some packages in requirements/cpu are installed here
# IBM provides optimized packages for ppc64le processors in the open-ce project for mamba
# Currently these may not be available for venv or pip directly
RUN micromamba install -y -n base -c https://ftp.osuosl.org/pub/open-ce/1.11.0-p10/ -c defaults python=3.10 rust && micromamba clean --all --yes
@ -21,7 +21,7 @@ RUN --mount=type=bind,source=.git,target=.git \
RUN --mount=type=cache,target=/root/.cache/pip \
RUSTFLAGS='-L /opt/conda/lib' pip install -v --prefer-binary --extra-index-url https://repo.fury.io/mgiessing \
'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \
-r requirements-cpu.txt \
-r requirements/cpu.txt \
xformers uvloop==0.20.0
RUN --mount=type=bind,source=.git,target=.git \

View File

@ -38,14 +38,14 @@ FROM fetch_vllm AS build_vllm
ARG USE_CYTHON
# Build vLLM
RUN cd vllm \
&& python3 -m pip install -r requirements-rocm.txt \
&& python3 -m pip install -r requirements/rocm.txt \
&& python3 setup.py clean --all \
&& if [ ${USE_CYTHON} -eq "1" ]; then python3 setup_cython.py build_ext --inplace; fi \
&& python3 setup.py bdist_wheel --dist-dir=dist
FROM scratch AS export_vllm
ARG COMMON_WORKDIR
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/dist/*.whl /
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/requirements*.txt /
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/requirements /requirements
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/benchmarks /benchmarks
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/tests /tests
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/examples /examples
@ -60,7 +60,8 @@ RUN python3 -m pip install --upgrade pip && rm -rf /var/lib/apt/lists/*
# Install vLLM
RUN --mount=type=bind,from=export_vllm,src=/,target=/install \
cd /install \
&& pip install -U -r requirements-rocm.txt \
&& pip install -U -r requirements/rocm.txt \
&& pip install -U -r requirements/rocm-test.txt \
&& pip uninstall -y vllm \
&& pip install *.whl
@ -99,7 +100,7 @@ RUN if [ ${BUILD_RPD} -eq "1" ]; then \
# Install vLLM
RUN --mount=type=bind,from=export_vllm,src=/,target=/install \
cd /install \
&& pip install -U -r requirements-rocm.txt \
&& pip install -U -r requirements/rocm.txt \
&& pip uninstall -y vllm \
&& pip install *.whl

View File

@ -6,7 +6,7 @@ ARG RCCL_BRANCH="648a58d"
ARG RCCL_REPO="https://github.com/ROCm/rccl"
ARG TRITON_BRANCH="e5be006"
ARG TRITON_REPO="https://github.com/triton-lang/triton.git"
ARG PYTORCH_BRANCH="8d4926e"
ARG PYTORCH_BRANCH="3a585126"
ARG PYTORCH_VISION_BRANCH="v0.19.1"
ARG PYTORCH_REPO="https://github.com/pytorch/pytorch.git"
ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git"

152
Dockerfile.s390x Normal file
View File

@ -0,0 +1,152 @@
# Base UBI image for s390x architecture
ARG BASE_UBI_IMAGE_TAG=9.5-1736404155
ARG PYTHON_VERSION=3.12
FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} AS base
# Install basic dependencies
ARG PYTHON_VERSION
ENV PYTHON_VERSION=${PYTHON_VERSION}
WORKDIR /workspace
ENV LANG=C.UTF-8 \
LC_ALL=C.UTF-8
# Install development utilities
RUN microdnf install -y \
which procps findutils tar vim git gcc gcc-gfortran g++ make patch zlib-devel \
libjpeg-turbo-devel libtiff-devel libpng-devel libwebp-devel freetype-devel harfbuzz-devel \
openssl-devel openblas openblas-devel autoconf automake libtool cmake && \
microdnf clean all
# Python Installation
FROM base AS python-install
ARG PYTHON_VERSION
ENV VIRTUAL_ENV=/opt/vllm
ENV PATH="$VIRTUAL_ENV/bin:$PATH"
ENV PYTHON_VERSION=${PYTHON_VERSION}
RUN microdnf install -y \
python${PYTHON_VERSION}-devel python${PYTHON_VERSION}-pip python${PYTHON_VERSION}-wheel && \
python${PYTHON_VERSION} -m venv $VIRTUAL_ENV && pip install --no-cache -U pip wheel uv && microdnf clean all
FROM python-install AS pyarrow
# Build Apache Arrow
WORKDIR /tmp
RUN --mount=type=cache,target=/root/.cache/uv \
git clone https://github.com/apache/arrow.git && \
cd arrow/cpp && \
mkdir release && cd release && \
cmake -DCMAKE_BUILD_TYPE=Release \
-DCMAKE_INSTALL_PREFIX=/usr/local \
-DARROW_PYTHON=ON \
-DARROW_PARQUET=ON \
-DARROW_ORC=ON \
-DARROW_FILESYSTEM=ON \
-DARROW_WITH_LZ4=ON \
-DARROW_WITH_ZSTD=ON \
-DARROW_WITH_SNAPPY=ON \
-DARROW_JSON=ON \
-DARROW_CSV=ON \
-DARROW_DATASET=ON \
-DPROTOBUF_PROTOC_EXECUTABLE=/usr/bin/protoc \
-DARROW_DEPENDENCY_SOURCE=BUNDLED \
.. && \
make -j$(nproc) && \
make install && \
cd ../../python && \
export PYARROW_PARALLEL=4 && \
export ARROW_BUILD_TYPE=release && \
uv pip install -r requirements/build.txt && \
python setup.py build_ext --build-type=$ARROW_BUILD_TYPE --bundle-arrow-cpp bdist_wheel
FROM python-install AS numa-build
# Install numactl (needed for numa.h dependency)
WORKDIR /tmp
RUN curl -LO https://github.com/numactl/numactl/archive/refs/tags/v2.0.16.tar.gz && \
tar -xvzf v2.0.16.tar.gz && \
cd numactl-2.0.16 && \
./autogen.sh && \
./configure && \
make
# Set include path
ENV C_INCLUDE_PATH="/usr/local/include:$C_INCLUDE_PATH"
FROM python-install AS rust
ENV CARGO_HOME=/root/.cargo
ENV RUSTUP_HOME=/root/.rustup
ENV PATH="$CARGO_HOME/bin:$RUSTUP_HOME/bin:$PATH"
RUN curl https://sh.rustup.rs -sSf | sh -s -- -y && \
. "$CARGO_HOME/env" && \
rustup default stable && \
rustup show
FROM python-install AS torch-vision
# Install torchvision
ARG TORCH_VERSION=2.7.0.dev20250304
ARG TORCH_VISION_VERSION=v0.20.1
WORKDIR /tmp
RUN --mount=type=cache,target=/root/.cache/uv \
git clone https://github.com/pytorch/vision.git && \
cd vision && \
git checkout $TORCH_VISION_VERSION && \
uv pip install -v torch==${TORCH_VERSION} --extra-index-url https://download.pytorch.org/whl/nightly/cpu && \
python setup.py bdist_wheel
# Final build stage
FROM python-install AS vllm-cpu
ARG PYTHON_VERSION
# Set correct library path for torch and numactl
ENV LD_LIBRARY_PATH="/opt/vllm/lib64/python${PYTHON_VERSION}/site-packages/torch/lib:/usr/local/lib:$LD_LIBRARY_PATH"
ENV C_INCLUDE_PATH="/usr/local/include:$C_INCLUDE_PATH"
ENV UV_LINK_MODE=copy
ENV CARGO_HOME=/root/.cargo
ENV RUSTUP_HOME=/root/.rustup
ENV PATH="$CARGO_HOME/bin:$RUSTUP_HOME/bin:$PATH"
COPY . /workspace/vllm
WORKDIR /workspace/vllm
RUN --mount=type=bind,from=numa-build,src=/tmp/numactl-2.0.16,target=/numactl \
make -C /numactl install
# Install dependencies, including PyTorch and Apache Arrow
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,from=rust,source=/root/.cargo,target=/root/.cargo,rw \
--mount=type=bind,from=rust,source=/root/.rustup,target=/root/.rustup,rw \
--mount=type=bind,from=pyarrow,source=/tmp/arrow/python/dist,target=/tmp/arrow-wheels \
--mount=type=bind,from=torch-vision,source=/tmp/vision/dist,target=/tmp/vision-wheels/ \
sed -i '/^torch/d' requirements/build.txt && \
ARROW_WHL_FILE=$(ls /tmp/arrow-wheels/pyarrow-*.whl | head -n 1) && \
VISION_WHL_FILE=$(ls /tmp/vision-wheels/*.whl | head -n 1) && \
uv pip install -v \
$ARROW_WHL_FILE \
$VISION_WHL_FILE \
--extra-index-url https://download.pytorch.org/whl/nightly/cpu \
--index-strategy unsafe-best-match \
-r requirements/build.txt \
-r requirements/cpu.txt
# Build and install vllm
RUN --mount=type=cache,target=/root/.cache/uv \
VLLM_TARGET_DEVICE=cpu python setup.py bdist_wheel && \
uv pip install "$(echo dist/*.whl)[tensorizer]"
# setup non-root user for vllm
RUN umask 002 && \
useradd --uid 2000 --gid 0 vllm && \
mkdir -p /home/vllm && \
chmod g+rwx /home/vllm
COPY LICENSE /licenses/vllm.md
COPY examples/*.jinja /app/data/template/
USER 2000
WORKDIR /home/vllm
# Set the default entrypoint
ENTRYPOINT ["python", "-m", "vllm.entrypoints.openai.api_server"]

View File

@ -15,11 +15,14 @@ ARG GIT_REPO_CHECK=0
RUN --mount=type=bind,source=.git,target=.git \
if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh; fi
# Remove existing versions of dependencies
RUN pip uninstall -y torch torch_xla torchvision
ENV VLLM_TARGET_DEVICE="tpu"
RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,source=.git,target=.git \
python3 -m pip install \
-r requirements-tpu.txt
-r requirements/tpu.txt
RUN python3 setup.py develop
# install development dependencies (for testing)

View File

@ -1,4 +1,4 @@
FROM intel/oneapi-basekit:2024.2.1-0-devel-ubuntu22.04 AS vllm-base
FROM intel/deep-learning-essentials:2025.0.1-0-devel-ubuntu22.04 AS vllm-base
RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
@ -21,30 +21,22 @@ RUN apt-get update -y && \
python3 \
python3-dev \
python3-pip \
# vim \
libze-intel-gpu-dev \
libze-intel-gpu1 \
wget
WORKDIR /workspace/vllm
COPY requirements-xpu.txt /workspace/vllm/requirements-xpu.txt
COPY requirements-common.txt /workspace/vllm/requirements-common.txt
COPY requirements/xpu.txt /workspace/vllm/requirements/xpu.txt
COPY requirements/common.txt /workspace/vllm/requirements/common.txt
RUN --mount=type=cache,target=/root/.cache/pip \
pip install --no-cache-dir \
-r requirements-xpu.txt
RUN git clone https://github.com/intel/pti-gpu && \
cd pti-gpu/sdk && \
git checkout 6c491f07a777ed872c2654ca9942f1d0dde0a082 && \
mkdir build && \
cd build && \
cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_TOOLCHAIN_FILE=../cmake/toolchains/icpx_toolchain.cmake -DBUILD_TESTING=OFF .. && \
make -j && \
cmake --install . --config Release --prefix "/usr/local"
-r requirements/xpu.txt
ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/lib/"
COPY . .
ARG GIT_REPO_CHECK
ARG GIT_REPO_CHECK=0
RUN --mount=type=bind,source=.git,target=.git \
if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh; fi
@ -54,6 +46,12 @@ RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,source=.git,target=.git \
python3 setup.py install
# Please refer to the XPU doc: we need to manually install intel-extension-for-pytorch 2.6.10+xpu because it has some conflicting dependencies with torch 2.6.0+xpu.
# FIXME: This will be fixed in ipex 2.7; just leaving this here for awareness.
RUN --mount=type=cache,target=/root/.cache/pip \
pip install intel-extension-for-pytorch==2.6.10+xpu \
--extra-index-url=https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
CMD ["/bin/bash"]
FROM vllm-base AS vllm-openai

View File

@ -1,9 +1,9 @@
include LICENSE
include requirements-common.txt
include requirements-cuda.txt
include requirements-rocm.txt
include requirements-neuron.txt
include requirements-cpu.txt
include requirements/common.txt
include requirements/cuda.txt
include requirements/rocm.txt
include requirements/neuron.txt
include requirements/cpu.txt
include CMakeLists.txt
recursive-include cmake *

View File

@ -13,11 +13,13 @@ Easy, fast, and cheap LLM serving for everyone
| <a href="https://docs.vllm.ai"><b>Documentation</b></a> | <a href="https://vllm.ai"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://x.com/vllm_project"><b>Twitter/X</b></a> | <a href="https://slack.vllm.ai"><b>Developer Slack</b></a> |
</p>
---
*Latest News* 🔥
- [2025/03] We hosted [the first vLLM China Meetup](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg)! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1REHvfQMKGnvz6p3Fd23HhSO4c8j5WPGZV0bKYLwnHyQ/edit#slide=id.g33fb1ff286e_0_29).
- [2025/03] We hosted [the East Coast vLLM Meetup](https://lu.ma/7mu4k4xx)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1NHiv8EUFF1NLd3fEYODm56nDmL26lEeXCaDgyDlTsRs/edit#slide=id.g31441846c39_0_0).
- [2025/02] We hosted [the ninth vLLM meetup](https://lu.ma/h7g3kuj9) with Meta! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1jzC_PZVXrVNSFVCW-V4cFXb6pn7zZ2CyP_Flwo05aqg/edit?usp=sharing) and AMD [here](https://drive.google.com/file/d/1Zk5qEJIkTmlQ2eQcXQZlljAx3m9s7nwn/view?usp=sharing). The slides from Meta will not be posted.
- [2025/01] We are excited to announce the alpha release of vLLM V1: A major architectural upgrade with 1.7x speedup! Clean code, optimized execution loop, zero-overhead prefix caching, enhanced multimodal support, and more. Please check out our blog post [here](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html).
- [2025/01] We hosted [the eighth vLLM meetup](https://lu.ma/zep56hui) with Google Cloud! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1epVkt4Zu8Jz_S5OhEHPc798emsYh2BwYfRuDDVEF7u4/edit?usp=sharing).
- [2025/01] We hosted [the eighth vLLM meetup](https://lu.ma/zep56hui) with Google Cloud! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1epVkt4Zu8Jz_S5OhEHPc798emsYh2BwYfRuDDVEF7u4/edit?usp=sharing), and Google Cloud team [here](https://drive.google.com/file/d/1h24pHewANyRL11xy5dXUbvRC9F9Kkjix/view?usp=sharing).
- [2024/12] vLLM joins [pytorch ecosystem](https://pytorch.org/blog/vllm-joins-pytorch)! Easy, Fast, and Cheap LLM Serving for Everyone!
- [2024/11] We hosted [the seventh vLLM meetup](https://lu.ma/h0qvrajz) with Snowflake! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing), and Snowflake team [here](https://docs.google.com/presentation/d/1qF3RkDAbOULwz9WK5TOltt2fE9t6uIc_hVNLFAaQX6A/edit?usp=sharing).
- [2024/10] We have just created a developer slack ([slack.vllm.ai](https://slack.vllm.ai)) focusing on coordinating contributions and discussing features. Please feel free to join us there!
@ -33,7 +35,9 @@ Easy, fast, and cheap LLM serving for everyone
- [2023/06] We officially released vLLM! FastChat-vLLM integration has powered [LMSYS Vicuna and Chatbot Arena](https://chat.lmsys.org) since mid-April. Check out our [blog post](https://vllm.ai).
---
## About
vLLM is a fast and easy-to-use library for LLM inference and serving.
Originally developed in the [Sky Computing Lab](https://sky.cs.berkeley.edu) at UC Berkeley, vLLM has evolved into a community-driven project with contributions from both academia and industry.
@ -79,7 +83,7 @@ pip install vllm
```
Visit our [documentation](https://docs.vllm.ai/en/latest/) to learn more.
- [Installation](https://docs.vllm.ai/en/latest/getting_started/installation/index.html)
- [Installation](https://docs.vllm.ai/en/latest/getting_started/installation.html)
- [Quickstart](https://docs.vllm.ai/en/latest/getting_started/quickstart.html)
- [List of Supported Models](https://docs.vllm.ai/en/latest/models/supported_models.html)
@ -127,6 +131,7 @@ We also have an official fundraising venue through [OpenCollective](https://open
## Citation
If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs/2309.06180):
```bibtex
@inproceedings{kwon2023efficient,
title={Efficient Memory Management for Large Language Model Serving with PagedAttention},
@ -138,11 +143,11 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs
## Contact Us
* For technical questions and feature requests, please use Github issues or discussions.
* For discussing with fellow users and coordinating contributions and development, please use Slack.
* For security disclosures, please use Github's security advisory feature.
* For collaborations and partnerships, please contact us at vllm-questions AT lists.berkeley.edu.
- For technical questions and feature requests, please use GitHub issues or discussions.
- For discussing with fellow users and coordinating contributions and development, please use Slack.
- For security disclosures, please use GitHub's security advisory feature.
- For collaborations and partnerships, please contact us at vllm-questions AT lists.berkeley.edu.
## Media Kit
* If you wish to use vLLM's logo, please refer to [our media kit repo](https://github.com/vllm-project/media-kit).
- If you wish to use vLLM's logo, please refer to [our media kit repo](https://github.com/vllm-project/media-kit).

54
RELEASE.md Normal file
View File

@ -0,0 +1,54 @@
# Releasing vLLM
vLLM releases offer a reliable version of the code base, packaged into a binary format that can be conveniently accessed via PyPI. These releases also serve as key milestones for the development team to communicate with the community about newly available features, improvements, and upcoming changes that could affect users, including potential breaking changes.
## Release Versioning
vLLM uses a “right-shifted” versioning scheme where a new patch release goes out every 2 weeks, and patch releases contain features as well as bug fixes (as opposed to semver, where a patch release contains only backwards-compatible bug fixes). When critical fixes need to be made, a special post1 release is published.
* _major_: major architectural milestones and incompatible API changes, similar to PyTorch 2.0.
* _minor_: major features
* _patch_: features and backwards-compatible bug fixes
* _post1_ or _patch-1_: backwards-compatible bug fixes, either as an explicit or implicit post release
## Release Cadence
Patch releases go out on a bi-weekly basis. A post release follows 1-3 days after a patch release and uses the same branch as the patch release.
The following is the release cadence for 2025. All future release dates below are tentative. Please note: post releases are optional.
| Release Date | Patch release versions | Post Release versions |
| --- | --- | --- |
| Jan 2025 | 0.7.0 | --- |
| Feb 2025 | 0.7.1, 0.7.2, 0.7.3 | --- |
| Mar 2025 | 0.7.4, 0.7.5 | --- |
| Apr 2025 | 0.7.6, 0.7.7 | --- |
| May 2025 | 0.7.8, 0.7.9 | --- |
| Jun 2025 | 0.7.10, 0.7.11 | --- |
| Jul 2025 | 0.7.12, 0.7.13 | --- |
| Aug 2025 | 0.7.14, 0.7.15 | --- |
| Sep 2025 | 0.7.16, 0.7.17 | --- |
| Oct 2025 | 0.7.18, 0.7.19 | --- |
| Nov 2025 | 0.7.20, 0.7.21 | --- |
| Dec 2025 | 0.7.22, 0.7.23 | --- |
## Release branch
Each release is built from a dedicated release branch.
* For _major_, _minor_, and _patch_ releases, the release branch cut is performed 1-2 days before the release goes live.
* For post releases, the previously cut release branch is reused.
* Release builds are triggered by a push to an RC tag like vX.Y.Z-rc1 (see the sketch below). This enables us to build and test multiple RCs for each release.
* The final tag vX.Y.Z does not trigger a build but is used for release notes and assets.
* After the branch cut is created, we monitor the main branch for any reverts and apply those reverts to the release branch.
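
A minimal sketch of cutting a branch and pushing tags; branch and version names are illustrative, not taken from the actual release tooling:

```bash
# Illustrative names only; shows how the RC tag triggers a build while the final tag does not.
git checkout -b releases/v0.7.4 main
git push origin releases/v0.7.4

git tag v0.7.4-rc1 && git push origin v0.7.4-rc1   # RC tag push triggers the release build
# ...test the RC, apply approved cherry-picks, repeat with -rc2 if needed...

git tag v0.7.4 && git push origin v0.7.4           # final tag: release notes and assets only
```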
## Release Cherry-Pick Criteria
After the branch cut, we finalize the release branch with clear criteria on which cherry-picks are allowed in. Note: a cherry-pick is the process of landing a PR in the release branch after the branch cut (a minimal example is sketched after this list). Cherry-picks are typically limited to ensure that the team has sufficient time to complete a thorough round of testing on a stable code base.
* Regression fixes: fixes for functional/performance regressions against the most recent release (e.g. 0.7.0 for the 0.7.1 release)
* Critical fixes: fixes for severe issues such as silent incorrectness, backwards-compatibility breaks, crashes, deadlocks, and (large) memory leaks
* Fixes to new features introduced in the most recent release (e.g. 0.7.0 for the 0.7.1 release)
* Documentation improvements
* Release-branch-specific changes (e.g. version identifier changes or CI fixes)
Please note: **No feature work is allowed in cherry-picks**. All PRs considered for cherry-picks need to be merged on trunk first; the only exception is release-branch-specific changes.
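
For illustration, a typical cherry-pick onto the release branch might look like the following; the branch name and commit hash are placeholders:

```bash
# Placeholder branch/commit; shows only the mechanics of landing an approved fix.
git checkout releases/v0.7.4
git cherry-pick -x <commit-sha-from-main>   # -x records the original commit in the message
git push origin releases/v0.7.4
```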

View File

@ -1,19 +1,217 @@
# Benchmarking vLLM
## Downloading the ShareGPT dataset
This README guides you through running benchmark tests with the extensive
datasets supported on vLLM. It's a living document, updated as new features and datasets
become available.
## Dataset Overview
<table style="width:100%; border-collapse: collapse;">
<thead>
<tr>
<th style="width:15%; text-align: left;">Dataset</th>
<th style="width:10%; text-align: center;">Online</th>
<th style="width:10%; text-align: center;">Offline</th>
<th style="width:65%; text-align: left;">Data Path</th>
</tr>
</thead>
<tbody>
<tr>
<td><strong>ShareGPT</strong></td>
<td style="text-align: center;"></td>
<td style="text-align: center;"></td>
<td><code>wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json</code></td>
</tr>
<tr>
<td><strong>BurstGPT</strong></td>
<td style="text-align: center;"></td>
<td style="text-align: center;"></td>
<td><code>wget https://github.com/HPMLL/BurstGPT/releases/download/v1.1/BurstGPT_without_fails_2.csv</code></td>
</tr>
<tr>
<td><strong>Sonnet</strong></td>
<td style="text-align: center;"></td>
<td style="text-align: center;"></td>
<td>Local file: <code>benchmarks/sonnet.txt</code></td>
</tr>
<tr>
<td><strong>Random</strong></td>
<td style="text-align: center;"></td>
<td style="text-align: center;"></td>
<td><code>synthetic</code></td>
</tr>
<tr>
<td><strong>HuggingFace</strong></td>
<td style="text-align: center;"></td>
<td style="text-align: center;">🟡</td>
<td>Specify your dataset path on HuggingFace</td>
</tr>
<tr>
<td><strong>VisionArena</strong></td>
<td style="text-align: center;"></td>
<td style="text-align: center;"></td>
<td><code>lmarena-ai/vision-arena-bench-v0.1</code> (a HuggingFace dataset)</td>
</tr>
</tbody>
</table>
✅: supported
🚧: to be supported
🟡: Partial support. Currently, HuggingFaceDataset only supports dataset formats
similar to `lmms-lab/LLaVA-OneVision-Data`. If you need support for other dataset
formats, please consider contributing.
**Note**: VisionArena's `dataset-name` should be set to `hf`.
---
## Example - Online Benchmark
First, start serving your model:
You can download the dataset by running:
```bash
wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
MODEL_NAME="NousResearch/Hermes-3-Llama-3.1-8B"
vllm serve ${MODEL_NAME} --disable-log-requests
```
## Downloading the ShareGPT4V dataset
Then run the benchmarking script:
The JSON file refers to several image datasets (COCO, LLaVA, etc.). The benchmark scripts
will ignore a datapoint if the referenced image is missing.
```bash
wget https://huggingface.co/datasets/Lin-Chen/ShareGPT4V/resolve/main/sharegpt4v_instruct_gpt4-vision_cap100k.json
mkdir coco -p
wget http://images.cocodataset.org/zips/train2017.zip -O coco/train2017.zip
unzip coco/train2017.zip -d coco/
# download dataset
# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
MODEL_NAME="NousResearch/Hermes-3-Llama-3.1-8B"
NUM_PROMPTS=10
BACKEND="vllm"
DATASET_NAME="sharegpt"
DATASET_PATH="<your data path>/ShareGPT_V3_unfiltered_cleaned_split.json"
python3 vllm/benchmarks/benchmark_serving.py --backend ${BACKEND} --model ${MODEL_NAME} --endpoint /v1/completions --dataset-name ${DATASET_NAME} --dataset-path ${DATASET_PATH} --num-prompts ${NUM_PROMPTS}
```
If successful, you will see the following output
```
============ Serving Benchmark Result ============
Successful requests: 10
Benchmark duration (s): 5.78
Total input tokens: 1369
Total generated tokens: 2212
Request throughput (req/s): 1.73
Output token throughput (tok/s): 382.89
Total Token throughput (tok/s): 619.85
---------------Time to First Token----------------
Mean TTFT (ms): 71.54
Median TTFT (ms): 73.88
P99 TTFT (ms): 79.49
-----Time per Output Token (excl. 1st token)------
Mean TPOT (ms): 7.91
Median TPOT (ms): 7.96
P99 TPOT (ms): 8.03
---------------Inter-token Latency----------------
Mean ITL (ms): 7.74
Median ITL (ms): 7.70
P99 ITL (ms): 8.39
==================================================
```
### VisionArena Benchmark for Vision Language Models
```bash
# need a model with vision capability here
vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests
```
```bash
MODEL_NAME="Qwen/Qwen2-VL-7B-Instruct"
NUM_PROMPTS=10
BACKEND="openai-chat"
DATASET_NAME="hf"
DATASET_PATH="lmarena-ai/vision-arena-bench-v0.1"
DATASET_SPLIT='train'
python3 vllm/benchmarks/benchmark_serving.py \
--backend "${BACKEND}" \
--model "${MODEL_NAME}" \
--endpoint "/v1/chat/completions" \
--dataset-name "${DATASET_NAME}" \
--dataset-path "${DATASET_PATH}" \
--hf-split "${DATASET_SPLIT}" \
--num-prompts "${NUM_PROMPTS}"
```
---
## Example - Offline Throughput Benchmark
```bash
MODEL_NAME="NousResearch/Hermes-3-Llama-3.1-8B"
NUM_PROMPTS=10
DATASET_NAME="sonnet"
DATASET_PATH="vllm/benchmarks/sonnet.txt"
python3 vllm/benchmarks/benchmark_throughput.py \
--model "${MODEL_NAME}" \
--dataset-name "${DATASET_NAME}" \
--dataset-path "${DATASET_PATH}" \
--num-prompts "${NUM_PROMPTS}"
```
If successful, you will see the following output
```
Throughput: 7.15 requests/s, 4656.00 total tokens/s, 1072.15 output tokens/s
Total num prompt tokens: 5014
Total num output tokens: 1500
```
### VisionArena Benchmark for Vision Language Models
``` bash
MODEL_NAME="Qwen/Qwen2-VL-7B-Instruct"
NUM_PROMPTS=10
DATASET_NAME="hf"
DATASET_PATH="lmarena-ai/vision-arena-bench-v0.1"
DATASET_SPLIT="train"
python3 vllm/benchmarks/benchmark_throughput.py \
--model "${MODEL_NAME}" \
--backend "vllm-chat" \
--dataset-name "${DATASET_NAME}" \
--dataset-path "${DATASET_PATH}" \
--num-prompts "${NUM_PROMPTS}" \
--hf-split "${DATASET_SPLIT}"
```
The `num prompt tokens` now includes image token counts
```
Throughput: 2.55 requests/s, 4036.92 total tokens/s, 326.90 output tokens/s
Total num prompt tokens: 14527
Total num output tokens: 1280
```
### Benchmark with LoRA Adapters
``` bash
# download dataset
# wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
MODEL_NAME="meta-llama/Llama-2-7b-hf"
BACKEND="vllm"
DATASET_NAME="sharegpt"
DATASET_PATH="<your data path>/ShareGPT_V3_unfiltered_cleaned_split.json"
NUM_PROMPTS=10
MAX_LORAS=2
MAX_LORA_RANK=8
ENABLE_LORA="--enable-lora"
LORA_PATH="yard1/llama-2-7b-sql-lora-test"
python3 vllm/benchmarks/benchmark_throughput.py \
--model "${MODEL_NAME}" \
--backend "${BACKEND}" \
--dataset_path "${DATASET_PATH}" \
--dataset_name "${DATASET_NAME}" \
--num-prompts "${NUM_PROMPTS}" \
--max-loras "${MAX_LORAS}" \
--max-lora-rank "${MAX_LORA_RANK}" \
${ENABLE_LORA} \
--lora-path "${LORA_PATH}"
```
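When `--enable-lora` is set, each sampled request may be paired with a randomly chosen adapter ID in the range `[1, max_loras]` (see `get_random_lora_request` in the dataset framework shown further down in this diff). A minimal sketch of that selection logic, with an illustrative function name:
```python
# Sketch of the per-request LoRA selection used during sampling.
import random
from typing import Optional


def pick_lora_id(max_loras: Optional[int], lora_path: Optional[str]) -> Optional[int]:
    """Return a random adapter ID in [1, max_loras], or None if LoRA is unused."""
    if max_loras is None or lora_path is None:
        return None
    return random.randint(1, max_loras)
```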

View File

@ -6,7 +6,7 @@ import sys
import time
import traceback
from dataclasses import dataclass, field
from typing import List, Optional, Union
from typing import Optional, Union
import aiohttp
import huggingface_hub.constants
@ -14,6 +14,9 @@ from tqdm.asyncio import tqdm
from transformers import (AutoTokenizer, PreTrainedTokenizer,
PreTrainedTokenizerFast)
# NOTE(simon): do not import vLLM here so the benchmark script
# can run without vLLM installed.
AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)
@ -25,7 +28,6 @@ class RequestFuncInput:
output_len: int
model: str
model_name: Optional[str] = None
best_of: int = 1
logprobs: Optional[int] = None
extra_body: Optional[dict] = None
multi_modal_content: Optional[dict] = None
@ -39,8 +41,8 @@ class RequestFuncOutput:
latency: float = 0.0
output_tokens: int = 0
ttft: float = 0.0 # Time to first token
itl: List[float] = field(
default_factory=list) # List of inter-token latencies
itl: list[float] = field(
default_factory=list) # list of inter-token latencies
tpot: float = 0.0 # avg next-token latencies
prompt_len: int = 0
error: str = ""
@ -56,7 +58,6 @@ async def async_request_tgi(
async with aiohttp.ClientSession(trust_env=True,
timeout=AIOHTTP_TIMEOUT) as session:
params = {
"best_of": request_func_input.best_of,
"max_new_tokens": request_func_input.output_len,
"do_sample": True,
"temperature": 0.01, # TGI does not accept 0.0 temperature.
@ -128,7 +129,6 @@ async def async_request_trt_llm(
async with aiohttp.ClientSession(trust_env=True,
timeout=AIOHTTP_TIMEOUT) as session:
assert request_func_input.best_of == 1
payload = {
"accumulate_tokens": True,
"text_input": request_func_input.prompt,
@ -193,7 +193,6 @@ async def async_request_deepspeed_mii(
) -> RequestFuncOutput:
async with aiohttp.ClientSession(trust_env=True,
timeout=AIOHTTP_TIMEOUT) as session:
assert request_func_input.best_of == 1
payload = {
"prompt": request_func_input.prompt,
@ -247,7 +246,6 @@ async def async_request_openai_completions(
if request_func_input.model_name else request_func_input.model,
"prompt": request_func_input.prompt,
"temperature": 0.0,
"best_of": request_func_input.best_of,
"max_tokens": request_func_input.output_len,
"logprobs": request_func_input.logprobs,
"stream": True,
@ -336,7 +334,7 @@ async def async_request_openai_chat_completions(
) -> RequestFuncOutput:
api_url = request_func_input.api_url
assert api_url.endswith(
"chat/completions"
("chat/completions", "profile")
), "OpenAI Chat Completions API URL must end with 'chat/completions'."
async with aiohttp.ClientSession(trust_env=True,
@ -430,12 +428,17 @@ def get_model(pretrained_model_name_or_path: str) -> str:
if os.getenv('VLLM_USE_MODELSCOPE', 'False').lower() == 'true':
from modelscope import snapshot_download
model_path = snapshot_download(
model_id=pretrained_model_name_or_path,
local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"])
from vllm.model_executor.model_loader.weight_utils import get_lock
return model_path
# Use file lock to prevent multiple processes from
# downloading the same model weights at the same time.
with get_lock(pretrained_model_name_or_path):
model_path = snapshot_download(
model_id=pretrained_model_name_or_path,
local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"])
return model_path
return pretrained_model_name_or_path

View File

@ -0,0 +1,688 @@
# SPDX-License-Identifier: Apache-2.0
"""
This module defines a framework for sampling benchmark requests from various
datasets. Each dataset subclass of BenchmarkDataset must implement sample
generation. Supported dataset types include:
- ShareGPT
- Random (synthetic)
- Sonnet
- BurstGPT
- HuggingFace
- VisionArena
TODO: Implement CustomDataset to parse a JSON file and convert its contents into
SampleRequest instances, similar to the approach used in ShareGPT.
"""
import base64
import io
import json
import random
from abc import ABC, abstractmethod
from collections.abc import Mapping
from dataclasses import dataclass
from functools import cache
from typing import Any, Optional, Union
import numpy as np
import pandas as pd
from datasets import load_dataset
from PIL import Image
from transformers import PreTrainedTokenizerBase
from vllm.lora.request import LoRARequest
from vllm.lora.utils import get_adapter_absolute_path
from vllm.multimodal import MultiModalDataDict
from vllm.transformers_utils.tokenizer import AnyTokenizer, get_lora_tokenizer
# -----------------------------------------------------------------------------
# Data Classes
# -----------------------------------------------------------------------------
@dataclass
class SampleRequest:
"""
Represents a single inference request for benchmarking.
"""
prompt: Union[str, Any]
prompt_len: int
expected_output_len: int
multi_modal_data: Optional[Union[MultiModalDataDict, dict]] = None
lora_request: Optional[LoRARequest] = None
# -----------------------------------------------------------------------------
# Benchmark Dataset Base Class
# -----------------------------------------------------------------------------
class BenchmarkDataset(ABC):
DEFAULT_SEED = 0
# num_requests has default 1000 in both the benchmark_serving.py and
# benchmark_throughput.py
def __init__(
self,
dataset_path: Optional[str] = None,
random_seed: int = DEFAULT_SEED,
) -> None:
"""
Initialize the BenchmarkDataset with an optional dataset path and random
seed.
Args:
dataset_path (Optional[str]): Path to the dataset. If None, it
indicates that a default or random dataset might be used.
random_seed (int): Seed value for reproducible shuffling or
sampling. Defaults to DEFAULT_SEED.
"""
self.dataset_path = dataset_path
# Set the random seed, ensuring that a None value is replaced with the
# default seed.
self.random_seed = (random_seed
if random_seed is not None else self.DEFAULT_SEED)
self.data = None
def apply_multimodal_chat_transformation(
self,
prompt: str,
mm_content: Optional[MultiModalDataDict] = None) -> list[dict]:
"""
Transform a prompt and optional multimodal content into a chat format.
This method is used for chat models that expect a specific
conversation format.
"""
content = [{"text": prompt, "type": "text"}]
if mm_content is not None:
content.append(mm_content)
return [{"role": "user", "content": content}]
def load_data(self) -> None:
"""
Load data from the dataset path into self.data.
This method must be overridden by subclasses since the method to load
data will vary depending on the dataset format and source.
Raises:
NotImplementedError: If a subclass does not implement this method.
"""
# TODO (jenniferzhao): add support for downloading data
raise NotImplementedError(
"load_data must be implemented in subclasses.")
def get_random_lora_request(
self,
tokenizer: PreTrainedTokenizerBase,
max_loras: Optional[int] = None,
lora_path: Optional[str] = None,
) -> tuple[Optional[LoRARequest], AnyTokenizer]:
"""
Optionally select a random LoRA request and return its associated
tokenizer.
This method is used when LoRA parameters are provided. It randomly
selects a LoRA based on max_loras and retrieves a cached tokenizer for
that LoRA if available. Otherwise, it returns the base tokenizer.
Args:
tokenizer (PreTrainedTokenizerBase): The base tokenizer to use if no
LoRA is selected.
max_loras (Optional[int]): The maximum number of LoRAs available.
If None, LoRA is not used.
lora_path (Optional[str]): Path to the LoRA parameters on disk.
If None, LoRA is not used.
Returns:
tuple[Optional[LoRARequest], AnyTokenizer]: A tuple where the first
element is a LoRARequest (or None if not applicable) and the second
element is the tokenizer associated with the LoRA request (or the
base tokenizer).
"""
if max_loras is None or lora_path is None:
return None, tokenizer
# Generate a random LoRA ID in the range [1, max_loras].
lora_id = random.randint(1, max_loras)
lora_request = LoRARequest(
lora_name=str(lora_id),
lora_int_id=lora_id,
lora_path=lora_path_on_disk(lora_path),
)
if lora_id not in lora_tokenizer_cache:
lora_tokenizer_cache[lora_id] = get_lora_tokenizer(lora_request)
# Return lora_request and the cached tokenizer if available; otherwise,
# return the base tokenizer
return lora_request, lora_tokenizer_cache[lora_id] or tokenizer
@abstractmethod
def sample(self, tokenizer: PreTrainedTokenizerBase,
num_requests: int) -> list[SampleRequest]:
"""
Abstract method to generate sample requests from the dataset.
Subclasses must override this method to implement dataset-specific logic
for generating a list of SampleRequest objects.
Args:
tokenizer (PreTrainedTokenizerBase): The tokenizer to be used
for processing the dataset's text.
num_requests (int): The number of sample requests to generate.
Returns:
list[SampleRequest]: A list of sample requests generated from the
dataset.
"""
raise NotImplementedError("sample must be implemented in subclasses.")
# -----------------------------------------------------------------------------
# Utility Functions and Global Caches
# -----------------------------------------------------------------------------
def is_valid_sequence(
prompt_len: int,
output_len: int,
min_len: int = 4,
max_prompt_len: int = 1024,
max_total_len: int = 2048,
skip_min_output_len_check: bool = False,
) -> bool:
"""
Validate a sequence based on prompt and output lengths.
Default pruning criteria are copied from the original `sample_hf_requests`
and `sample_sharegpt_requests` functions in benchmark_serving.py, as well as
from `sample_requests` in benchmark_throughput.py.
"""
# Check for invalid conditions
prompt_too_short = prompt_len < min_len
output_too_short = (not skip_min_output_len_check) and (output_len
< min_len)
prompt_too_long = prompt_len > max_prompt_len
combined_too_long = (prompt_len + output_len) > max_total_len
# Return True if none of the invalid conditions are met
return not (prompt_too_short or output_too_short or prompt_too_long
or combined_too_long)
@cache
def lora_path_on_disk(lora_path: str) -> str:
return get_adapter_absolute_path(lora_path)
# Global cache for LoRA tokenizers.
lora_tokenizer_cache: dict[int, AnyTokenizer] = {}
def process_image(image: Any) -> Mapping[str, Any]:
"""
Process a single image input and return a multimedia content dictionary.
For a PIL.Image.Image input:
- Converts the image to RGB.
- Saves the image as a JPEG in-memory.
- Encodes the JPEG data as a base64 string.
- Returns a dictionary with the image as a base64 data URL.
For a string input:
- Treats the string as a URL or file path.
- Prepends "file://" if the string doesn't start with "http://" or
"file://".
- Returns a dictionary with the image URL.
Raises:
ValueError: If the input is neither a PIL.Image.Image nor a string.
"""
if isinstance(image, Image.Image):
image = image.convert("RGB")
with io.BytesIO() as image_data:
image.save(image_data, format="JPEG")
image_base64 = base64.b64encode(
image_data.getvalue()).decode("utf-8")
return {
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{image_base64}"
},
}
if isinstance(image, str):
image_url = (image if image.startswith(
("http://", "file://")) else f"file://{image}")
return {"type": "image_url", "image_url": {"url": image_url}}
raise ValueError(
f"Invalid image input {image}. Must be a PIL.Image.Image or str.")
# -----------------------------------------------------------------------------
# Random Dataset Implementation (Synthetic Data)
# -----------------------------------------------------------------------------
class RandomDataset(BenchmarkDataset):
# Default values copied from benchmark_serving.py for the random dataset.
DEFAULT_PREFIX_LEN = 0
DEFAULT_RANGE_RATIO = 1.0
DEFAULT_INPUT_LEN = 1024
DEFAULT_OUTPUT_LEN = 128
def __init__(
self,
**kwargs,
) -> None:
super().__init__(**kwargs)
def sample(self,
tokenizer: PreTrainedTokenizerBase,
num_requests: int,
prefix_len: int = DEFAULT_PREFIX_LEN,
range_ratio: float = DEFAULT_RANGE_RATIO,
input_len: int = DEFAULT_INPUT_LEN,
output_len: int = DEFAULT_OUTPUT_LEN,
**kwargs) -> list[SampleRequest]:
vocab_size = tokenizer.vocab_size
prefix_token_ids = (np.random.randint(
0, vocab_size, size=prefix_len).tolist() if prefix_len > 0 else [])
input_low = int(input_len * range_ratio)
output_low = int(output_len * range_ratio)
input_lens = np.random.randint(input_low,
input_len + 1,
size=num_requests)
output_lens = np.random.randint(output_low,
output_len + 1,
size=num_requests)
offsets = np.random.randint(0, vocab_size, size=num_requests)
requests = []
for i in range(num_requests):
inner_seq = ((offsets[i] + i + np.arange(input_lens[i])) %
vocab_size).tolist()
token_sequence = prefix_token_ids + inner_seq
prompt = tokenizer.decode(token_sequence)
total_input_len = prefix_len + int(input_lens[i])
requests.append(
SampleRequest(
prompt=prompt,
prompt_len=total_input_len,
expected_output_len=int(output_lens[i]),
))
return requests
# -----------------------------------------------------------------------------
# ShareGPT Dataset Implementation
# -----------------------------------------------------------------------------
class ShareGPTDataset(BenchmarkDataset):
"""
Implements the ShareGPT dataset. Loads data from a JSON file and generates
sample requests based on conversation turns.
"""
def __init__(self, **kwargs) -> None:
super().__init__(**kwargs)
self.load_data()
def load_data(self) -> None:
if self.dataset_path is None:
raise ValueError("dataset_path must be provided for loading data.")
with open(self.dataset_path, encoding="utf-8") as f:
self.data = json.load(f)
# Filter entries with at least two conversation turns.
self.data = [
entry for entry in self.data
if "conversations" in entry and len(entry["conversations"]) >= 2
]
random.seed(self.random_seed)
random.shuffle(self.data)
def sample(self,
tokenizer: PreTrainedTokenizerBase,
num_requests: int,
lora_path: Optional[str] = None,
max_loras: Optional[int] = None,
output_len: Optional[int] = None,
enable_multimodal_chat: bool = False,
**kwargs) -> list:
samples: list = []
for entry in self.data:
if len(samples) >= num_requests:
break
prompt, completion = entry["conversations"][0]["value"],\
entry["conversations"][1]["value"]
lora_request, tokenizer = self.get_random_lora_request(
tokenizer=tokenizer, max_loras=max_loras, lora_path=lora_path)
prompt_ids = tokenizer(prompt).input_ids
completion_ids = tokenizer(completion).input_ids
prompt_len = len(prompt_ids)
new_output_len = (len(completion_ids)
if output_len is None else output_len)
if not is_valid_sequence(prompt_len,
new_output_len,
skip_min_output_len_check=output_len
is not None):
continue
if enable_multimodal_chat:
prompt = self.apply_multimodal_chat_transformation(
prompt, None)
samples.append(
SampleRequest(
prompt=prompt,
prompt_len=prompt_len,
expected_output_len=new_output_len,
lora_request=lora_request,
))
return samples
# -----------------------------------------------------------------------------
# Sonnet Dataset Implementation
# -----------------------------------------------------------------------------
class SonnetDataset(BenchmarkDataset):
"""
Simplified implementation of the Sonnet dataset. Loads poem lines from a
text file and generates sample requests. Default values here copied from
`benchmark_serving.py` for the sonnet dataset.
"""
DEFAULT_PREFIX_LEN = 200
DEFAULT_INPUT_LEN = 550
DEFAULT_OUTPUT_LEN = 150
def __init__(
self,
**kwargs,
) -> None:
super().__init__(**kwargs)
self.load_data()
def load_data(self) -> None:
if not self.dataset_path:
raise ValueError("dataset_path must be provided.")
with open(self.dataset_path, encoding="utf-8") as f:
self.data = f.readlines()
def sample(self,
tokenizer,
num_requests: int,
prefix_len: int = DEFAULT_PREFIX_LEN,
input_len: int = DEFAULT_INPUT_LEN,
output_len: int = DEFAULT_OUTPUT_LEN,
return_prompt_formatted: bool = False,
**kwargs) -> list:
# Calculate average token length for a poem line.
tokenized_lines = [tokenizer(line).input_ids for line in self.data]
avg_len = sum(len(tokens)
for tokens in \
tokenized_lines) / len(tokenized_lines)
# Build the base prompt.
base_prompt = "Pick as many lines as you can from these poem lines:\n"
base_msg = [{"role": "user", "content": base_prompt}]
base_fmt = tokenizer.apply_chat_template(base_msg,
add_generation_prompt=True,
tokenize=False)
base_offset = len(tokenizer(base_fmt).input_ids)
if input_len <= base_offset:
raise ValueError(
f"'input_len' must be higher than the base prompt length "
f"({base_offset}).")
# Determine how many poem lines to use.
num_input_lines = round((input_len - base_offset) / avg_len)
num_prefix_lines = round((prefix_len - base_offset) / avg_len)
prefix_lines = self.data[:num_prefix_lines]
samples = []
for _ in range(num_requests):
extra_lines = random.choices(self.data,
k=num_input_lines - num_prefix_lines)
prompt = f"{base_prompt}{''.join(prefix_lines + extra_lines)}"
msg = [{"role": "user", "content": prompt}]
prompt_formatted = tokenizer.apply_chat_template(
msg, add_generation_prompt=True, tokenize=False)
prompt_len = len(tokenizer(prompt_formatted).input_ids)
samples.append(
SampleRequest(
prompt=prompt_formatted
if return_prompt_formatted else prompt,
prompt_len=prompt_len,
expected_output_len=output_len,
))
return samples
# -----------------------------------------------------------------------------
# BurstGPT Dataset Implementation
# -----------------------------------------------------------------------------
class BurstGPTDataset(BenchmarkDataset):
"""
Implements the BurstGPT dataset. Loads data from a CSV file and generates
sample requests based on synthetic prompt generation. Only rows with Model
"GPT-4" and positive response tokens are used.
"""
def __init__(self, **kwargs) -> None:
super().__init__(**kwargs)
self.load_data()
def load_data(self, ):
if self.dataset_path is None:
raise ValueError("dataset_path must be provided for loading data.")
df = pd.read_csv(self.dataset_path)
# Filter to keep only GPT-4 rows.
gpt4_df = df[df["Model"] == "GPT-4"]
# Remove failed requests (where Response tokens is 0 or less).
gpt4_df = gpt4_df[gpt4_df["Response tokens"] > 0]
# Sample the desired number of rows.
self.data = gpt4_df
def _sample_loaded_data(self, num_requests: int) -> list:
if num_requests <= len(self.data):
data = self.data.sample(n=num_requests,
random_state=self.random_seed)
else:
data = self.data.sample(
n=num_requests,
random_state=self.random_seed,
replace=True,
)
# Convert the dataframe to a list of lists.
return data.values.tolist()
def sample(self,
tokenizer: PreTrainedTokenizerBase,
num_requests: int,
max_loras: Optional[int] = None,
lora_path: Optional[str] = None,
**kwargs) -> list[SampleRequest]:
samples = []
data = self._sample_loaded_data(num_requests=num_requests)
for i in range(num_requests):
input_len = int(data[i][2])
output_len = int(data[i][3])
lora_req, tokenizer = self.get_random_lora_request(
tokenizer=tokenizer, max_loras=max_loras, lora_path=lora_path)
vocab_size = tokenizer.vocab_size
# Generate a synthetic prompt: a list of token IDs computed as (i +
# j) modulo vocab_size.
token_ids = [(i + j) % vocab_size for j in range(input_len)]
prompt = tokenizer.decode(token_ids)
samples.append(
SampleRequest(
prompt=prompt,
prompt_len=input_len,
expected_output_len=output_len,
lora_request=lora_req,
))
return samples
# -----------------------------------------------------------------------------
# HuggingFace Dataset Implementation
# -----------------------------------------------------------------------------
class HuggingFaceDataset(BenchmarkDataset):
"""
Dataset class for processing a HuggingFace dataset with conversation data
and optional images.
"""
DEFAULT_NUM_REQUESTS = 1000
def __init__(
self,
dataset_split: str,
dataset_subset: Optional[str] = None,
**kwargs,
) -> None:
super().__init__(**kwargs)
self.dataset_split = dataset_split
self.dataset_subset = dataset_subset
self.load_data()
def load_data(self) -> None:
if not self.dataset_path:
raise ValueError("dataset_path must be provided for loading data.")
self.data = load_dataset(
self.dataset_path,
name=self.dataset_subset,
split=self.dataset_split,
streaming=True,
)
if self.data.features is None or "conversations" \
not in self.data.features:
raise ValueError(
"HuggingFaceDataset currently only supports datasets with "
"a 'conversations' column like lmms-lab/LLaVA-OneVision-Data. "
"Please consider contributing if you would like to add "
"support for additional dataset formats.")
# Shuffle and filter examples with at least 2 conversations.
self.data = self.data.shuffle(seed=self.random_seed).filter(
lambda x: len(x["conversations"]) >= 2)
def sample(self,
tokenizer: PreTrainedTokenizerBase,
num_requests: int,
output_len: Optional[int] = None,
enable_multimodal_chat: bool = False,
**kwargs) -> list:
sampled_requests = []
dynamic_output = output_len is None
for item in self.data:
if len(sampled_requests) >= num_requests:
break
conv = item["conversations"]
prompt, completion = conv[0]["value"], conv[1]["value"]
prompt_ids = tokenizer(prompt).input_ids
completion_ids = tokenizer(completion).input_ids
prompt_len = len(prompt_ids)
completion_len = len(completion_ids)
output_len = completion_len if dynamic_output else output_len
assert isinstance(output_len, int) and output_len > 0
if dynamic_output and not is_valid_sequence(
prompt_len, completion_len):
continue
mm_content = process_image(
item["image"]) if "image" in item else None
if enable_multimodal_chat:
# Note: when chat is enabled the request prompt_len is no longer
# accurate and we will be using request output to count the
# actual prompt len and output len
prompt = self.apply_multimodal_chat_transformation(
prompt, mm_content)
sampled_requests.append(
SampleRequest(
prompt=prompt,
prompt_len=prompt_len,
expected_output_len=output_len,
multi_modal_data=mm_content,
))
return sampled_requests
# -----------------------------------------------------------------------------
# Vision Arena Dataset Implementation
# -----------------------------------------------------------------------------
class VisionArenaDataset(HuggingFaceDataset):
"""
Vision Arena Dataset.
"""
DEFAULT_OUTPUT_LEN = 128
DEFAULT_NUM_REQUESTS = 1000
VISION_ARENA_DATASET_PATH = "lmarena-ai/vision-arena-bench-v0.1"
def __init__(
self,
**kwargs,
) -> None:
super().__init__(**kwargs)
if self.dataset_path != self.VISION_ARENA_DATASET_PATH:
raise ValueError(f"Only support Vision Arena dataset.\
This data path {self.dataset_path} is not valid.")
if self.dataset_subset is None and self.dataset_split != "train":
raise ValueError("Dataset split must be 'train'.")
self.load_data()
def load_data(self) -> None:
dataset = load_dataset(
self.dataset_path,
name=self.dataset_subset,
split=self.dataset_split,
streaming=True,
)
self.data = dataset.shuffle(seed=self.random_seed)
def sample(self,
tokenizer: PreTrainedTokenizerBase,
num_requests: int,
output_len: Optional[int] = None,
enable_multimodal_chat: bool = False,
**kwargs) -> list:
output_len = (output_len
if output_len is not None else self.DEFAULT_OUTPUT_LEN)
sampled_requests = []
for item in self.data:
if len(sampled_requests) >= num_requests:
break
prompt = item["turns"][0][0]["content"]
mm_content = process_image(item["images"][0])
prompt_len = len(tokenizer(prompt).input_ids)
if enable_multimodal_chat:
# Note: when chat is enabled the request prompt_len is no longer
# accurate and we will be using request output to count the
# actual prompt len
prompt = self.apply_multimodal_chat_transformation(
prompt, mm_content)
sampled_requests.append(
SampleRequest(
prompt=prompt,
prompt_len=prompt_len,
expected_output_len=output_len,
multi_modal_data=mm_content,
))
return sampled_requests
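Taken together, the classes above follow one pattern: construct a dataset with a path and seed, then call `sample()` with a tokenizer and a request count. A hedged usage sketch, assuming the module above is importable as `benchmark_dataset` (as `benchmark_serving.py` does later in this diff); the model name and file path are placeholders:
```python
# Illustrative wiring of a dataset class to a tokenizer.
from benchmark_dataset import ShareGPTDataset  # module shown above
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("NousResearch/Hermes-3-Llama-3.1-8B")
dataset = ShareGPTDataset(
    dataset_path="ShareGPT_V3_unfiltered_cleaned_split.json",  # placeholder path
    random_seed=0,
)
requests = dataset.sample(tokenizer=tokenizer, num_requests=10)
for req in requests[:3]:
    print(req.prompt_len, req.expected_output_len)
```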

View File

@ -1,495 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
"""Benchmark guided decoding throughput."""
import argparse
import dataclasses
import json
import os
import random
import time
from typing import List
import datasets
import pandas as pd
import uvloop
from transformers import AutoTokenizer, PreTrainedTokenizerBase
from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
from vllm.entrypoints.openai.api_server import (
build_async_engine_client_from_engine_args)
from vllm.sampling_params import GuidedDecodingParams
from vllm.utils import FlexibleArgumentParser, merge_async_iterators
@dataclasses.dataclass
class SampleRequest:
"""A class representing a single inference request for benchmarking.
Attributes:
prompt: The input text prompt for the model.
multi_modal_data: Optional dictionary containing multi-modal data (e.g.
images).
prompt_len: The length of the prompt in tokens.
expected_output_len: The expected length of the output in tokens.
"""
prompt: str
prompt_len: int
expected_output_len: int
schema: dict
structure_type: str = 'json'
completion: str = None
def run_vllm(requests: List[SampleRequest],
engine_args: EngineArgs,
n: int,
guided_decoding_rate: float = 1.0,
warmup: bool = False) -> float:
from vllm import LLM, SamplingParams
llm = LLM(**vars(engine_args))
# Add the requests to the engine.
prompts: List[str] = []
sampling_params: List[SamplingParams] = []
# create a list containing random selected true or false
guided_decoding_req_idx = random.sample(
range(len(requests)), int(len(requests) * guided_decoding_rate))
if warmup:
print(">>>>> Running warmup prompt, for the first 5")
# We setup the first 5 requests to warmup FSM
# if using xgrammar dataset, we will skip warmup
warmup_requests = requests[:5]
for i, request in enumerate(warmup_requests):
prompts.append(request.prompt)
sampling_params.append(
SamplingParams(
n=n,
temperature=1.0,
top_p=1.0,
ignore_eos=True,
max_tokens=request.expected_output_len,
guided_decoding=GuidedDecodingParams(json=request.schema)
if guided_decoding_rate > 0 else None,
))
llm.generate(prompts, sampling_params, use_tqdm=False)
print(">>>>> Benchmark started...")
prompts = []
sampling_params = []
for i, request in enumerate(requests):
prompts.append(request.prompt)
sampling_params.append(
SamplingParams(
n=n,
temperature=1.0,
top_p=1.0,
ignore_eos=True,
max_tokens=request.expected_output_len,
guided_decoding=GuidedDecodingParams(
**{request.structure_type: request.schema})
if i in guided_decoding_req_idx else None,
))
start = time.perf_counter()
outputs = llm.generate(prompts, sampling_params, use_tqdm=False)
ret = []
for output, request in zip(outputs, requests):
generated_text = output.outputs[0].text
ret.append({
"generated": generated_text,
"expected": request.completion
})
end = time.perf_counter()
return end - start, ret
async def run_vllm_async(
requests: List[SampleRequest],
engine_args: AsyncEngineArgs,
n: int,
guided_decoding_rate: float = 1.0,
warmup: bool = False,
disable_frontend_multiprocessing: bool = False) -> float:
from vllm import SamplingParams
async with build_async_engine_client_from_engine_args(
engine_args, disable_frontend_multiprocessing) as llm:
# Add the requests to the engine.
prompts: List[str] = []
sampling_params: List[SamplingParams] = []
guided_decoding_req_idx = random.sample(
range(len(requests)), int(len(requests) * guided_decoding_rate))
if warmup:
print(">>>>>> Running warmup prompt, for the first 5")
# We setup the first 5 requests to warmup FSM
# if using xgrammar dataset, we will skip warmup
warmup_requests = requests[:5]
for i, request in enumerate(warmup_requests):
prompts.append(request.prompt)
sampling_params.append(
SamplingParams(
n=n,
temperature=1.0,
top_p=1.0,
ignore_eos=True,
max_tokens=request.expected_output_len,
guided_decoding=GuidedDecodingParams(
json=request.schema)
if guided_decoding_rate > 0 else None,
))
generators = []
for i, (prompt, sp) in enumerate(zip(prompts, sampling_params)):
generator = llm.generate(prompt, sp, request_id=f"test{i}")
generators.append(generator)
all_gens = merge_async_iterators(*generators)
async for i, res in all_gens:
pass
print(">>>>> Benchmark started...")
prompts = []
sampling_params = []
for i, request in enumerate(requests):
prompts.append(request.prompt)
sampling_params.append(
SamplingParams(
n=n,
temperature=1.0,
top_p=1.0,
ignore_eos=True,
max_tokens=request.expected_output_len,
guided_decoding=GuidedDecodingParams(json=request.schema)
if i in guided_decoding_req_idx else None,
))
generators = []
start_time = []
latencies = []
start = time.perf_counter()
for i, (prompt, sp) in enumerate(zip(prompts, sampling_params)):
generator = llm.generate(prompt, sp, request_id=f"test{i}")
generators.append(generator)
start_time.append(time.perf_counter())
latencies.append([])
all_gens = merge_async_iterators(*generators)
generated_texts = [''] * len(requests)
async for i, res in all_gens:
generated_texts[i] = res.outputs[0].text
lat = time.perf_counter() - start_time[i]
latencies[i].append(lat)
ret = [{
'generated': gt,
'expected': req.completion
} for gt, req in zip(generated_texts, requests)]
end = time.perf_counter()
first_latency = pd.Series([lat[0] * 1000 for lat in latencies])
next_latency = pd.Series([(lat[-1] - lat[0]) / len(lat[1:]) * 1000
for lat in latencies])
return end - start, ret, (first_latency, next_latency)
def sample_requests(tokenizer: PreTrainedTokenizerBase,
args: argparse.Namespace) -> List[SampleRequest]:
if args.dataset == 'json':
if args.json_schema_path is None:
dir_path = os.path.dirname(os.path.realpath(__file__))
args.json_schema_path = os.path.join(dir_path,
"structured_schemas",
"structured_schema_1.json")
with open(args.json_schema_path) as f:
schema = json.load(f)
prompt = f"Generate an example of a user profile given the following schema: {json.dumps(schema)}" # noqa: E501
input_len = len(tokenizer(prompt).input_ids)
print(f"Input length of the prompt: {input_len} tokens")
requests = [
SampleRequest(prompt=prompt,
prompt_len=input_len,
expected_output_len=args.output_len,
schema=schema,
structure_type=args.structure_type)
for _ in range(args.num_prompts)
]
elif args.dataset == "grammar":
schema = """
?start: select_statement
?select_statement: "SELECT " column_list " FROM " table_name
?column_list: column_name ("," column_name)*
?table_name: identifier
?column_name: identifier
?identifier: /[a-zA-Z_][a-zA-Z0-9_]*/
"""
prompt = "Generate an SQL query to show the 'username' \
and 'email' from the 'users' table."
input_len = len(tokenizer(prompt).input_ids)
print(f"Input length of the prompt: {input_len} tokens")
requests = [
SampleRequest(prompt=prompt,
prompt_len=input_len,
expected_output_len=args.output_len,
schema=schema,
structure_type=args.structure_type)
for _ in range(args.num_prompts)
]
elif args.dataset == "regex":
regex = r"\w+@\w+\.com\n"
args.regex = regex
prompt = "Generate an email address for Alan Turing, \
who works in Enigma. End in .com and new line. \
Example result: alan.turing@enigma.com\n"
input_len = len(tokenizer(prompt).input_ids)
print(f"Input length of the prompt: {input_len} tokens")
requests = [
SampleRequest(prompt=prompt,
prompt_len=input_len,
expected_output_len=args.output_len,
schema=regex,
structure_type=args.structure_type)
for _ in range(args.num_prompts)
]
elif args.dataset == "choice":
choice = ["Positive", "Negative"]
args.choice = choice
prompt = "Classify this sentiment: vLLM is wonderful!"
input_len = len(tokenizer(prompt).input_ids)
print(f"Input length of the prompt: {input_len} tokens")
requests = [
SampleRequest(prompt=prompt,
prompt_len=input_len,
expected_output_len=args.output_len,
schema=choice,
structure_type=args.structure_type)
for _ in range(args.num_prompts)
]
elif args.dataset == "xgrammar_bench":
args.warmup = False
requests: List[SampleRequest] = []
dataset = datasets.load_dataset("NousResearch/json-mode-eval",
split="train")
print(f"dataset has {len(dataset)} entries")
len_dataset = len(dataset)
for data_point_idx in range(args.num_prompts):
idx = data_point_idx
while idx >= len_dataset:
idx -= len_dataset
schema = dataset["schema"][idx]
prompt = tokenizer.apply_chat_template(dataset["prompt"][idx],
tokenize=False)
input_len = len(tokenizer(prompt).input_ids)
completion = dataset["completion"][idx]
requests.append(
SampleRequest(prompt=prompt,
prompt_len=input_len,
expected_output_len=args.output_len,
schema=schema,
completion=completion))
return requests
def evaluate(ret, args):
def _eval_correctness_json(expected, actual):
# extract json string from string using regex
import re
actual = actual.replace('\n', '').replace(' ', '').strip()
try:
actual = re.search(r'\{.*\}', actual).group()
actual = json.loads(actual)
except Exception:
return False
return True
def _eval_correctness_choice(expected, actual):
return actual in args.choice
def _eval_correctness_regex(expected, actual):
import re
return re.match(args.regex, actual) is not None
def _eval_correctness(expected, actual):
if args.structure_type == 'json':
return _eval_correctness_json(expected, actual)
elif args.structure_type == 'regex':
return _eval_correctness_regex(expected, actual)
elif args.structure_type == 'choice':
return _eval_correctness_choice(expected, actual)
else:
return None
scores = []
for res in ret:
score = _eval_correctness(res['expected'], res['generated'])
res['correctness'] = score
scores.append(score)
not_none_scores = [score for score in scores if score is not None]
return (sum(not_none_scores) / len(not_none_scores) *
100) if len(not_none_scores) > 0 else None
def main(args: argparse.Namespace):
print(args)
random.seed(args.seed)
# async engine is working for 'regex', 'choice' and 'grammar'
if args.dataset == 'grammar':
args.structure_type = 'grammar'
args.async_engine = False
elif args.dataset == 'regex':
args.structure_type = 'regex'
args.async_engine = False
elif args.dataset == 'choice':
args.structure_type = 'choice'
args.async_engine = False
else:
args.structure_type = 'json'
if args.no_guided_decoding:
args.guided_decoding_ratio = 0
if args.save_results:
result_file_name = f'{args.guided_decoding_ratio}guided'
result_file_name += f"_{args.model.split('/')[-1]}"
result_file_name += f"_{args.dataset}"
result_file_name += f"_{args.num_prompts}"
result_file_name += f"_out{args.output_len}"
result_file_name += f"_async{args.async_engine}"
result_file_name += f"_warmup{args.warmup}"
result_file_name += f"_chunkedprefill{args.enable_chunked_prefill}"
result_file_name += ".txt"
else:
result_file_name = None
# Synthesize a prompt with the given input length.
tokenizer = AutoTokenizer.from_pretrained(
args.tokenizer, trust_remote_code=args.trust_remote_code)
requests = sample_requests(tokenizer, args)
if args.async_engine:
engine_args = AsyncEngineArgs.from_cli_args(args)
elapsed_time, ret, (first_latency, next_latency) = uvloop.run(
run_vllm_async(requests, engine_args, args.n,
args.guided_decoding_ratio, args.warmup,
args.disable_frontend_multiprocessing))
else:
engine_args = EngineArgs.from_cli_args(args)
elapsed_time, ret = run_vllm(requests, engine_args, args.n,
args.guided_decoding_ratio, args.warmup)
first_latency, next_latency = None, None
score = evaluate(ret, args)
total_num_tokens = sum(request.prompt_len + request.expected_output_len
for request in requests)
total_output_tokens = sum(request.expected_output_len
for request in requests)
if first_latency is not None:
latency_breakdown = "\nFirst token latency(msecs):\n"
latency_breakdown += f"{first_latency.describe()}"
latency_breakdown += "\nNext token latency(msecs):\n"
latency_breakdown += f"{next_latency.describe()}"
print(
f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
f"{total_num_tokens / elapsed_time:.2f} total tokens/s, "
f"{total_output_tokens / elapsed_time:.2f} output tokens/s",
f"Correct rate is {score} %",
f"{latency_breakdown if first_latency is not None else ''}")
# Output JSON results if specified
if args.output_json or result_file_name:
results = {
"elapsed_time": elapsed_time,
"num_requests": len(requests),
"total_num_tokens": total_num_tokens,
"total_output_tokens": total_output_tokens,
"requests_per_second": len(requests) / elapsed_time,
"tokens_per_second": f"{total_num_tokens / elapsed_time:.2f}",
"output_tokens_per_second":
f"{total_output_tokens / elapsed_time:.2f}",
"correct_rate(%)": score
}
results = {"outputs": ret, **results}
if first_latency is not None:
results["first_token_latency(msecs)"] = first_latency.describe(
).to_dict()
results["next_token_latency(msecs)"] = next_latency.describe(
).to_dict()
if args.output_json:
with open(args.output_json, "w") as f:
json.dump(results, f, indent=4)
elif result_file_name:
with open(result_file_name, "w") as f:
json.dump(results, f, indent=4)
if __name__ == "__main__":
parser = FlexibleArgumentParser(description="Benchmark guided decoding.")
parser = AsyncEngineArgs.add_cli_args(parser)
parser.add_argument("--output-len",
type=int,
default=512,
help="Output length for each request. Overrides the "
"output length from the dataset.")
parser.add_argument(
"--dataset",
default='json',
choices=['json', 'grammar', 'regex', 'choice', 'xgrammar_bench'])
parser.add_argument("--json_schema_path",
type=str,
default=None,
help="Path to json schema.")
parser.add_argument("--n",
type=int,
default=1,
help="Number of generated sequences per prompt.")
parser.add_argument("--num-prompts",
type=int,
default=10,
help="Number of prompts to process.")
parser.add_argument(
'--output-json',
type=str,
default=None,
help='Path to save the throughput results in JSON format.')
parser.add_argument("--async-engine",
action='store_true',
default=False,
help="Use vLLM async engine rather than LLM class.")
parser.add_argument("--no-guided-decoding",
action='store_true',
default=False,
help="Whether to disable JSON decoding or not.")
parser.add_argument("--guided-decoding-ratio",
type=float,
default=1.0,
help="Ratio of Guided Decoding requests")
parser.add_argument("--disable-frontend-multiprocessing",
action='store_true',
default=False,
help="Disable decoupled async engine frontend.")
parser.add_argument("--warmup",
action="store_true",
default=False,
help="Run warmup prompts before benchmark.")
parser.add_argument("--save-results",
action="store_true",
default=False,
help="save output results.")
args = parser.parse_args()
if args.tokenizer is None:
args.tokenizer = args.model
main(args)

View File

@ -1,14 +1,17 @@
# SPDX-License-Identifier: Apache-2.0
"""Benchmark the latency of processing a single batch of requests."""
import argparse
import dataclasses
import json
import os
import time
from pathlib import Path
from typing import List, Optional
from typing import Any, Optional
import numpy as np
import torch
from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
from tqdm import tqdm
from vllm import LLM, SamplingParams
@ -18,6 +21,18 @@ from vllm.sampling_params import BeamSearchParams
from vllm.utils import FlexibleArgumentParser
def save_to_pytorch_benchmark_format(args: argparse.Namespace,
results: dict[str, Any]) -> None:
pt_records = convert_to_pytorch_benchmark_format(
args=args,
metrics={"latency": results["latencies"]},
extra_info={k: results[k]
for k in ["avg_latency", "percentiles"]})
if pt_records:
pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch.json"
write_to_json(pt_file, pt_records)
def main(args: argparse.Namespace):
print(args)
@ -26,6 +41,10 @@ def main(args: argparse.Namespace):
# NOTE(woosuk): If the request cannot be processed in a single batch,
# the engine will automatically process the request in multiple batches.
llm = LLM(**dataclasses.asdict(engine_args))
assert llm.llm_engine.model_config.max_model_len >= (
args.input_len +
args.output_len), ("Please ensure that max_model_len is greater than"
" the sum of input_len and output_len.")
sampling_params = SamplingParams(
n=args.n,
@ -33,12 +52,13 @@ def main(args: argparse.Namespace):
top_p=1.0,
ignore_eos=True,
max_tokens=args.output_len,
detokenize=not args.disable_detokenize,
)
print(sampling_params)
dummy_prompt_token_ids = np.random.randint(10000,
size=(args.batch_size,
args.input_len))
dummy_prompts: List[PromptType] = [{
dummy_prompts: list[PromptType] = [{
"prompt_token_ids": batch
} for batch in dummy_prompt_token_ids.tolist()]
@ -54,7 +74,8 @@ def main(args: argparse.Namespace):
beam_width=args.n,
max_tokens=args.output_len,
ignore_eos=True,
))
),
)
def run_to_completion(profile_dir: Optional[str] = None):
if profile_dir:
@ -64,7 +85,8 @@ def main(args: argparse.Namespace):
torch.profiler.ProfilerActivity.CUDA,
],
on_trace_ready=torch.profiler.tensorboard_trace_handler(
str(profile_dir))) as p:
str(profile_dir)),
) as p:
llm_generate()
print(p.key_averages().table(sort_by="self_cuda_time_total"))
else:
@ -81,9 +103,8 @@ def main(args: argparse.Namespace):
if args.profile:
profile_dir = args.profile_result_dir
if not profile_dir:
profile_dir = Path(
"."
) / "vllm_benchmark_result" / f"latency_result_{time.time()}"
profile_dir = (Path(".") / "vllm_benchmark_result" /
f"latency_result_{time.time()}")
print(f"Profiling (results will be saved to '{profile_dir}')...")
run_to_completion(profile_dir=profile_dir)
return
@ -95,9 +116,9 @@ def main(args: argparse.Namespace):
latencies = np.array(latencies)
percentages = [10, 25, 50, 75, 90, 99]
percentiles = np.percentile(latencies, percentages)
print(f'Avg latency: {np.mean(latencies)} seconds')
print(f"Avg latency: {np.mean(latencies)} seconds")
for percentage, percentile in zip(percentages, percentiles):
print(f'{percentage}% percentile latency: {percentile} seconds')
print(f"{percentage}% percentile latency: {percentile} seconds")
# Output JSON results if specified
if args.output_json:
@ -108,43 +129,57 @@ def main(args: argparse.Namespace):
}
with open(args.output_json, "w") as f:
json.dump(results, f, indent=4)
save_to_pytorch_benchmark_format(args, results)
if __name__ == '__main__':
if __name__ == "__main__":
parser = FlexibleArgumentParser(
description='Benchmark the latency of processing a single batch of '
'requests till completion.')
parser.add_argument('--input-len', type=int, default=32)
parser.add_argument('--output-len', type=int, default=128)
parser.add_argument('--batch-size', type=int, default=8)
parser.add_argument('--n',
type=int,
default=1,
help='Number of generated sequences per prompt.')
parser.add_argument('--use-beam-search', action='store_true')
parser.add_argument('--num-iters-warmup',
type=int,
default=10,
help='Number of iterations to run for warmup.')
parser.add_argument('--num-iters',
description="Benchmark the latency of processing a single batch of "
"requests till completion.")
parser.add_argument("--input-len", type=int, default=32)
parser.add_argument("--output-len", type=int, default=128)
parser.add_argument("--batch-size", type=int, default=8)
parser.add_argument(
"--n",
type=int,
default=1,
help="Number of generated sequences per prompt.",
)
parser.add_argument("--use-beam-search", action="store_true")
parser.add_argument(
"--num-iters-warmup",
type=int,
default=10,
help="Number of iterations to run for warmup.",
)
parser.add_argument("--num-iters",
type=int,
default=30,
help='Number of iterations to run.')
help="Number of iterations to run.")
parser.add_argument(
'--profile',
action='store_true',
help='profile the generation process of a single batch')
"--profile",
action="store_true",
help="profile the generation process of a single batch",
)
parser.add_argument(
'--profile-result-dir',
"--profile-result-dir",
type=str,
default=None,
help=('path to save the pytorch profiler output. Can be visualized '
'with ui.perfetto.dev or Tensorboard.'))
help=("path to save the pytorch profiler output. Can be visualized "
"with ui.perfetto.dev or Tensorboard."),
)
parser.add_argument(
'--output-json',
"--output-json",
type=str,
default=None,
help='Path to save the latency results in JSON format.')
help="Path to save the latency results in JSON format.",
)
parser.add_argument(
"--disable-detokenize",
action="store_true",
help=("Do not detokenize responses (i.e. do not include "
"detokenization time in the latency measurement)"),
)
parser = EngineArgs.add_cli_args(parser)
args = parser.parse_args()

View File

@ -31,7 +31,7 @@ import dataclasses
import json
import random
import time
from typing import List, Optional, Tuple
from typing import Optional
from transformers import PreTrainedTokenizerBase
@ -77,9 +77,9 @@ def sample_requests_from_dataset(
dataset_path: str,
num_requests: int,
tokenizer: PreTrainedTokenizerBase,
input_length_range: Tuple[int, int],
input_length_range: tuple[int, int],
fixed_output_len: Optional[int],
) -> List[Request]:
) -> list[Request]:
if fixed_output_len is not None and fixed_output_len < 4:
raise ValueError("output_len too small")
@ -99,7 +99,7 @@ def sample_requests_from_dataset(
assert min_len >= 0 and max_len >= min_len, "input_length_range too small"
# Filter out sequences that are too long or too short
filtered_requests: List[Request] = []
filtered_requests: list[Request] = []
for i in range(len(dataset)):
if len(filtered_requests) == num_requests:
@ -122,10 +122,10 @@ def sample_requests_from_dataset(
def sample_requests_from_random(
num_requests: int,
tokenizer: PreTrainedTokenizerBase,
input_length_range: Tuple[int, int],
input_length_range: tuple[int, int],
fixed_output_len: Optional[int],
prefix_len: int,
) -> List[Request]:
) -> list[Request]:
requests = []
prefix_token_ids = sample_tokens(tokenizer, prefix_len)
@ -144,9 +144,9 @@ def sample_requests_from_random(
return requests
def repeat_and_sort_requests(requests: List[Request],
def repeat_and_sort_requests(requests: list[Request],
repeat_count: int,
sort: bool = False) -> List[str]:
sort: bool = False) -> list[str]:
repeated_requests = requests * repeat_count
if sort:
repeated_requests.sort(key=lambda x: x[1])
@ -194,7 +194,9 @@ def main(args):
llm = LLM(**dataclasses.asdict(engine_args))
sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len)
sampling_params = SamplingParams(temperature=0,
max_tokens=args.output_len,
detokenize=not args.disable_detokenize)
print("Testing filtered requests")
prompts = repeat_and_sort_requests(filtered_requests,
@ -243,6 +245,12 @@ if __name__ == "__main__":
"subtract this length when filtering prompts. Only used "
"when dataset-path is not provided.",
)
parser.add_argument(
'--disable-detokenize',
action='store_true',
help=("Do not detokenize responses (i.e. do not include "
"detokenization time in the latency measurement)"),
)
parser = EngineArgs.add_cli_args(parser)
args = parser.parse_args()

View File

@ -5,7 +5,7 @@ import dataclasses
import json
import random
import time
from typing import List, Optional, Tuple
from typing import Optional
from transformers import AutoTokenizer, PreTrainedTokenizerBase
@ -13,12 +13,17 @@ from vllm.engine.arg_utils import EngineArgs
from vllm.utils import FlexibleArgumentParser
#Select a equi-probable random priority
def get_random_flag():
return 0 if random.random() < 0.5 else 1
def sample_requests(
dataset_path: str,
num_requests: int,
tokenizer: PreTrainedTokenizerBase,
fixed_output_len: Optional[int],
) -> List[Tuple[str, int, int]]:
) -> list[tuple[str, int, int, int]]:
if fixed_output_len is not None and fixed_output_len < 4:
raise ValueError("output_len too small")
@ -35,7 +40,7 @@ def sample_requests(
random.shuffle(dataset)
# Filter out sequences that are too long or too short
filtered_dataset: List[Tuple[str, int, int]] = []
filtered_dataset: list[tuple[str, int, int]] = []
for i in range(len(dataset)):
if len(filtered_dataset) == num_requests:
break
@ -55,8 +60,7 @@ def sample_requests(
# Prune too long sequences.
continue
#Select a equi-probable random priority
priority = 0 if random.random() < 0.5 else 1
priority = get_random_flag()
filtered_dataset.append((prompt, prompt_len, output_len, priority))
@ -64,13 +68,20 @@ def sample_requests(
def run_vllm(
requests: List[Tuple[str, int, int]],
requests: list[tuple[str, int, int]],
n: int,
engine_args: EngineArgs,
disable_detokenize: bool = False,
) -> float:
from vllm import LLM, SamplingParams
llm = LLM(**dataclasses.asdict(engine_args))
assert all(
llm.llm_engine.model_config.max_model_len >= (request[1] + request[2])
for request in requests), (
"Please ensure that max_model_len is greater than the sum of"
" input_len and output_len for all requests.")
# Add the requests to the engine.
prompts = []
sampling_params = []
@ -85,6 +96,7 @@ def run_vllm(
top_p=1.0,
ignore_eos=True,
max_tokens=output_len,
detokenize=not disable_detokenize,
))
start = time.perf_counter()
@ -103,15 +115,16 @@ def main(args: argparse.Namespace):
if args.dataset is None:
# Synthesize a prompt with the given input length.
prompt = "hi" * (args.input_len - 1)
requests = [(prompt, args.input_len, args.output_len)
for _ in range(args.num_prompts)]
requests = [(prompt, args.input_len, args.output_len,
get_random_flag()) for _ in range(args.num_prompts)]
else:
requests = sample_requests(args.dataset, args.num_prompts, tokenizer,
args.output_len)
if args.backend == "vllm":
elapsed_time = run_vllm(requests, args.n,
EngineArgs.from_cli_args(args))
EngineArgs.from_cli_args(args),
args.disable_detokenize)
else:
raise ValueError(f"Unknown backend: {args.backend}")
total_num_tokens = sum(prompt_len + output_len
@ -164,6 +177,12 @@ if __name__ == "__main__":
type=str,
default=None,
help='Path to save the throughput results in JSON format.')
parser.add_argument(
'--disable-detokenize',
action='store_true',
help=("Do not detokenize responses (i.e. do not include "
"detokenization time in the latency measurement)"),
)
parser = EngineArgs.add_cli_args(parser)
args = parser.parse_args()

View File

@ -25,23 +25,20 @@ On the client side, run:
"""
import argparse
import asyncio
import base64
import gc
import io
import json
import os
import random
import time
import warnings
from collections.abc import AsyncGenerator, Iterable
from dataclasses import dataclass
from datetime import datetime
from typing import Any, AsyncGenerator, Collection, Dict, List, Optional, Tuple
from typing import Any, Optional
import numpy as np
from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput,
RequestFuncOutput)
from datasets import load_dataset
from PIL.Image import Image
from tqdm.asyncio import tqdm
from transformers import PreTrainedTokenizerBase
@ -55,6 +52,11 @@ try:
except ImportError:
from argparse import ArgumentParser as FlexibleArgumentParser
from benchmark_dataset import (BurstGPTDataset, HuggingFaceDataset,
RandomDataset, SampleRequest, ShareGPTDataset,
SonnetDataset, VisionArenaDataset)
from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
MILLISECONDS_TO_SECONDS_CONVERSION = 1000
@ -70,326 +72,48 @@ class BenchmarkMetrics:
mean_ttft_ms: float
median_ttft_ms: float
std_ttft_ms: float
percentiles_ttft_ms: List[Tuple[float, float]]
percentiles_ttft_ms: list[tuple[float, float]]
mean_tpot_ms: float
median_tpot_ms: float
std_tpot_ms: float
percentiles_tpot_ms: List[Tuple[float, float]]
percentiles_tpot_ms: list[tuple[float, float]]
mean_itl_ms: float
median_itl_ms: float
std_itl_ms: float
percentiles_itl_ms: List[Tuple[float, float]]
percentiles_itl_ms: list[tuple[float, float]]
# E2EL stands for end-to-end latency per request.
# It is the time taken on the client side from sending
# a request to receiving a complete response.
mean_e2el_ms: float
median_e2el_ms: float
std_e2el_ms: float
percentiles_e2el_ms: List[Tuple[float, float]]
def sample_sharegpt_requests(
dataset_path: str,
num_requests: int,
tokenizer: PreTrainedTokenizerBase,
fixed_output_len: Optional[int] = None,
) -> List[Tuple[str, int, int, None]]:
# Load the dataset.
with open(dataset_path, encoding='utf-8') as f:
dataset = json.load(f)
# Filter out the conversations with less than 2 turns.
dataset = [data for data in dataset if len(data["conversations"]) >= 2]
# Only keep the first two turns of each conversation.
dataset = [(data["conversations"][0]["value"],
data["conversations"][1]["value"]) for data in dataset]
# Shuffle the dataset.
random.shuffle(dataset)
# Filter out sequences that are too long or too short
filtered_dataset: List[Tuple[str, int, int]] = []
for i in range(len(dataset)):
if len(filtered_dataset) == num_requests:
break
# Tokenize the prompts and completions.
prompt = dataset[i][0]
prompt_token_ids = tokenizer(prompt).input_ids
completion = dataset[i][1]
completion_token_ids = tokenizer(completion).input_ids
prompt_len = len(prompt_token_ids)
output_len = len(completion_token_ids
) if fixed_output_len is None else fixed_output_len
if prompt_len < 4 or (fixed_output_len is None and output_len < 4):
# Prune too short sequences.
continue
if prompt_len > 1024 or prompt_len + output_len > 2048:
# Prune too long sequences.
continue
filtered_dataset.append((prompt, prompt_len, output_len, None))
return filtered_dataset
def sample_sonnet_requests(
dataset_path: str,
num_requests: int,
input_len: int,
output_len: int,
prefix_len: int,
tokenizer: PreTrainedTokenizerBase,
) -> List[Tuple[str, str, int, int, None]]:
assert (
input_len > prefix_len
), "'args.sonnet-input-len' must be greater than 'args.prefix-input-len'."
# Load the dataset.
with open(dataset_path, encoding='utf-8') as f:
poem_lines = f.readlines()
# Tokenize the poem lines.
poem_token_ids = tokenizer(poem_lines).input_ids
average_poem_len = sum(
len(token_ids) for token_ids in poem_token_ids) / len(poem_token_ids)
# Base prefix for all requests.
base_prompt = "Pick as many lines as you can from these poem lines:\n"
base_message = [{
"role": "user",
"content": base_prompt,
}]
base_prompt_formatted = tokenizer.apply_chat_template(
base_message, add_generation_prompt=True, tokenize=False)
base_prompt_offset = len(tokenizer(base_prompt_formatted).input_ids)
assert (
input_len > base_prompt_offset
), f"Please set 'args.sonnet-input-len' higher than {base_prompt_offset}."
num_input_lines = round(
(input_len - base_prompt_offset) / average_poem_len)
# First approximately `prefix_len` number of tokens in the
# prompt are fixed poem lines.
assert (
prefix_len > base_prompt_offset
), f"Please set 'args.sonnet-prefix-len' higher than {base_prompt_offset}."
num_prefix_lines = round(
(prefix_len - base_prompt_offset) / average_poem_len)
prefix_lines = poem_lines[:num_prefix_lines]
# Sample the rest of lines per request.
sampled_requests: List[Tuple[str, int, int]] = []
for _ in range(num_requests):
num_lines_needed = num_input_lines - num_prefix_lines
sampled_lines = "".join(prefix_lines +
random.choices(poem_lines, k=num_lines_needed))
prompt = f"{base_prompt}{sampled_lines}"
message = [
{
"role": "user",
"content": prompt,
},
]
prompt_formatted = tokenizer.apply_chat_template(
message, add_generation_prompt=True, tokenize=False)
prompt_len = len(tokenizer(prompt_formatted).input_ids)
sampled_requests.append(
(prompt, prompt_formatted, prompt_len, output_len, None))
return sampled_requests
def sample_vision_arena_requests(
dataset,
num_requests: int,
tokenizer: PreTrainedTokenizerBase,
fixed_output_len: Optional[int] = None,
) -> List[Tuple[str, str, int, Optional[Dict[str, Collection[str]]]]]:
sampled_requests: List[Tuple[str, int, int, Dict[str,
Collection[str]]]] = []
for data in dataset:
if len(sampled_requests) == num_requests:
break
prompt = data["turns"][0][0]['content']
prompt_token_ids = tokenizer(prompt).input_ids
if fixed_output_len is None:
# Default max output len is set to 128
print("--hf-output-len is not provided. Using default value 128.")
fixed_output_len = 128
prompt_len = len(prompt_token_ids)
output_len = fixed_output_len
assert isinstance(
data["images"][0],
Image), ("Input image format must be `PIL.Image.Image`, "
f"given {type(data['image'])}.")
image: Image = data["images"][0]
image = image.convert("RGB")
image_data = io.BytesIO()
image.save(image_data, format='JPEG')
image_base64 = base64.b64encode(image_data.getvalue()).decode("utf-8")
mm_content = {
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{image_base64}"
},
}
sampled_requests.append((prompt, prompt_len, output_len, mm_content))
return sampled_requests
def sample_hf_requests(
dataset_path: str,
dataset_subset: Optional[str],
dataset_split: str,
num_requests: int,
tokenizer: PreTrainedTokenizerBase,
random_seed: int,
fixed_output_len: Optional[int] = None,
) -> List[Tuple[str, str, int, Optional[Dict[str, Collection[str]]]]]:
# Special case for vision_arena dataset
if dataset_path == 'lmarena-ai/vision-arena-bench-v0.1' \
and dataset_subset is None:
assert dataset_split == "train"
dataset = load_dataset(dataset_path,
name=dataset_subset,
split=dataset_split,
streaming=True)
dataset = dataset.shuffle(seed=random_seed)
return sample_vision_arena_requests(dataset, num_requests, tokenizer,
fixed_output_len)
dataset = load_dataset(dataset_path,
name=dataset_subset,
split=dataset_split,
streaming=True)
assert "conversations" in dataset.features, (
"HF Dataset must have 'conversations' column.")
filter_func = lambda x: len(x["conversations"]) >= 2
filtered_dataset = dataset.shuffle(seed=random_seed).filter(filter_func)
sampled_requests: List[Tuple[str, int, int, Dict[str,
Collection[str]]]] = []
for data in filtered_dataset:
if len(sampled_requests) == num_requests:
break
# Tokenize the prompts and completions.
prompt = data["conversations"][0]["value"]
prompt_token_ids = tokenizer(prompt).input_ids
completion = data["conversations"][1]["value"]
completion_token_ids = tokenizer(completion).input_ids
prompt_len = len(prompt_token_ids)
output_len = len(completion_token_ids
) if fixed_output_len is None else fixed_output_len
if fixed_output_len is None and (prompt_len < 4 or output_len < 4):
# Prune too short sequences.
continue
if fixed_output_len is None and \
(prompt_len > 1024 or prompt_len + output_len > 2048):
# Prune too long sequences.
continue
if "image" in data and isinstance(data["image"], Image):
image: Image = data["image"]
image = image.convert("RGB")
image_data = io.BytesIO()
image.save(image_data, format='JPEG')
image_base64 = base64.b64encode(
image_data.getvalue()).decode("utf-8")
mm_content = {
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{image_base64}"
},
}
elif "image" in data and isinstance(data["image"], str):
if (data["image"].startswith("http://") or \
data["image"].startswith("file://")):
image_url = data["image"]
else:
image_url = f"file://{data['image']}"
mm_content = {
"type": "image_url",
"image_url": {
"url": image_url
},
}
else:
mm_content = None
sampled_requests.append((prompt, prompt_len, output_len, mm_content))
return sampled_requests
def sample_random_requests(
prefix_len: int,
input_len: int,
output_len: int,
num_prompts: int,
range_ratio: float,
tokenizer: PreTrainedTokenizerBase,
) -> List[Tuple[str, int, int]]:
prefix_token_ids = np.random.randint(0,
tokenizer.vocab_size,
size=prefix_len).tolist()
input_lens = np.random.randint(
int(input_len * range_ratio),
input_len + 1,
size=num_prompts,
)
output_lens = np.random.randint(
int(output_len * range_ratio),
output_len + 1,
size=num_prompts,
)
offsets = np.random.randint(0, tokenizer.vocab_size, size=num_prompts)
input_requests = []
for i in range(num_prompts):
prompt = tokenizer.decode(prefix_token_ids +
[(offsets[i] + i + j) % tokenizer.vocab_size
for j in range(input_lens[i])])
input_requests.append((prompt, int(prefix_len + input_lens[i]),
int(output_lens[i]), None))
return input_requests
percentiles_e2el_ms: list[tuple[float, float]]
async def get_request(
input_requests: List[Tuple[str, int, int]],
input_requests: list[SampleRequest],
request_rate: float,
burstiness: float = 1.0,
) -> AsyncGenerator[Tuple[str, int, int], None]:
) -> AsyncGenerator[SampleRequest, None]:
"""
Asynchronously generates requests at a specified rate
Asynchronously generates requests at a specified rate
with OPTIONAL burstiness.
Args:
input_requests:
A list of input requests, each represented as a tuple.
request_rate:
input_requests:
A list of input requests, each represented as a SampleRequest.
request_rate:
The rate at which requests are generated (requests/s).
burstiness (optional):
The burstiness factor of the request generation.
burstiness (optional):
The burstiness factor of the request generation.
Only takes effect when request_rate is not inf.
Default value is 1, which follows a Poisson process.
Otherwise, the request intervals follow a gamma distribution.
A lower burstiness value (0 < burstiness < 1) results
in more bursty requests, while a higher burstiness value
A lower burstiness value (0 < burstiness < 1) results
in more bursty requests, while a higher burstiness value
(burstiness > 1) results in a more uniform arrival of requests.
"""
input_requests = iter(input_requests)
input_requests: Iterable[SampleRequest] = iter(input_requests)
# Calculate scale parameter theta to maintain the desired request_rate.
assert burstiness > 0, (
@ -411,23 +135,23 @@ async def get_request(
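# Illustrative sketch, not part of the diff above: the get_request docstring
# describes inter-arrival times drawn from a gamma distribution whose shape is
# the burstiness factor. Assuming numpy, one interval could be sampled roughly
# as follows; with shape == 1 the gamma collapses to an exponential (Poisson
# arrivals), while shape < 1 clusters requests into bursts.
import numpy as np

def _example_sample_interval(request_rate: float, burstiness: float = 1.0) -> float:
    # Scale theta keeps the mean interval at 1 / request_rate for any shape.
    theta = 1.0 / (request_rate * burstiness)
    return float(np.random.gamma(shape=burstiness, scale=theta))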
def calculate_metrics(
input_requests: List[Tuple[str, int, int]],
outputs: List[RequestFuncOutput],
input_requests: list[SampleRequest],
outputs: list[RequestFuncOutput],
dur_s: float,
tokenizer: PreTrainedTokenizerBase,
selected_percentile_metrics: List[str],
selected_percentiles: List[float],
goodput_config_dict: Dict[str, float],
) -> Tuple[BenchmarkMetrics, List[int]]:
actual_output_lens: List[int] = []
selected_percentile_metrics: list[str],
selected_percentiles: list[float],
goodput_config_dict: dict[str, float],
) -> tuple[BenchmarkMetrics, list[int]]:
actual_output_lens: list[int] = []
total_input = 0
completed = 0
good_completed = 0
itls: List[float] = []
tpots: List[float] = []
all_tpots: List[float] = []
ttfts: List[float] = []
e2els: List[float] = []
itls: list[float] = []
tpots: list[float] = []
all_tpots: list[float] = []
ttfts: list[float] = []
e2els: list[float] = []
for i in range(len(outputs)):
if outputs[i].success:
output_len = outputs[i].output_tokens
@ -442,7 +166,7 @@ def calculate_metrics(
tokenizer(outputs[i].generated_text,
add_special_tokens=False).input_ids)
actual_output_lens.append(output_len)
total_input += input_requests[i][1]
total_input += input_requests[i].prompt_len
tpot = 0
if output_len > 1:
latency_minus_ttft = outputs[i].latency - outputs[i].ttft
@ -525,18 +249,18 @@ async def benchmark(
model_id: str,
model_name: str,
tokenizer: PreTrainedTokenizerBase,
input_requests: List[Tuple[str, int, int]],
input_requests: list[SampleRequest],
logprobs: Optional[int],
best_of: int,
request_rate: float,
burstiness: float,
disable_tqdm: bool,
profile: bool,
selected_percentile_metrics: List[str],
selected_percentiles: List[str],
selected_percentile_metrics: list[str],
selected_percentiles: list[float],
ignore_eos: bool,
goodput_config_dict: Dict[str, float],
goodput_config_dict: dict[str, float],
max_concurrency: Optional[int],
lora_modules: Optional[Iterable[str]],
):
if backend in ASYNC_REQUEST_FUNCS:
request_func = ASYNC_REQUEST_FUNCS[backend]
@ -544,12 +268,16 @@ async def benchmark(
raise ValueError(f"Unknown backend: {backend}")
print("Starting initial single prompt test run...")
test_prompt, test_prompt_len, test_output_len, test_mm_content = (
input_requests[0])
test_prompt, test_prompt_len, test_output_len, test_mm_content = \
input_requests[0].prompt, input_requests[0].prompt_len, \
input_requests[0].expected_output_len, \
input_requests[0].multi_modal_data
if backend != "openai-chat" and test_mm_content is not None:
# multi-modal benchmark is only available on OpenAI Chat backend.
raise ValueError(
"Multi-modal content is only supported on 'openai-chat' backend.")
assert test_mm_content is None or isinstance(test_mm_content, dict)
test_input = RequestFuncInput(
model=model_id,
model_name=model_name,
@ -558,10 +286,10 @@ async def benchmark(
prompt_len=test_prompt_len,
output_len=test_output_len,
logprobs=logprobs,
best_of=best_of,
multi_modal_content=test_mm_content,
ignore_eos=ignore_eos,
)
test_output = await request_func(request_func_input=test_input)
if not test_output.success:
raise ValueError(
@ -570,6 +298,12 @@ async def benchmark(
else:
print("Initial test run completed. Starting main benchmark run...")
if lora_modules:
# For each input request, choose a LoRA module at random.
lora_modules = iter(
[random.choice(lora_modules) \
for _ in range(len(input_requests))])
if profile:
print("Starting profiler...")
profile_input = RequestFuncInput(model=model_id,
@ -579,7 +313,6 @@ async def benchmark(
prompt_len=test_prompt_len,
output_len=test_output_len,
logprobs=logprobs,
best_of=best_of,
multi_modal_content=test_mm_content,
ignore_eos=ignore_eos)
profile_output = await request_func(request_func_input=profile_input)
@ -613,24 +346,30 @@ async def benchmark(
pbar=pbar)
benchmark_start_time = time.perf_counter()
tasks: List[asyncio.Task] = []
tasks: list[asyncio.Task] = []
async for request in get_request(input_requests, request_rate, burstiness):
prompt, prompt_len, output_len, mm_content = request
request_func_input = RequestFuncInput(model=model_id,
model_name=model_name,
prompt, prompt_len, output_len, mm_content = request.prompt, \
request.prompt_len, request.expected_output_len, \
request.multi_modal_data
req_model_id, req_model_name = model_id, model_name
if lora_modules:
req_lora_module = next(lora_modules)
req_model_id, req_model_name = req_lora_module, req_lora_module
request_func_input = RequestFuncInput(model=req_model_id,
model_name=req_model_name,
prompt=prompt,
api_url=api_url,
prompt_len=prompt_len,
output_len=output_len,
logprobs=logprobs,
best_of=best_of,
multi_modal_content=mm_content,
ignore_eos=ignore_eos)
tasks.append(
asyncio.create_task(
limited_request_func(request_func_input=request_func_input,
pbar=pbar)))
outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks)
outputs: list[RequestFuncOutput] = await asyncio.gather(*tasks)
if profile:
print("Stopping profiler...")
@ -641,7 +380,6 @@ async def benchmark(
prompt_len=test_prompt_len,
output_len=test_output_len,
logprobs=logprobs,
best_of=best_of,
)
profile_output = await request_func(request_func_input=profile_input)
if profile_output.success:
@ -775,6 +513,31 @@ def parse_goodput(slo_pairs):
return goodput_config_dict
def save_to_pytorch_benchmark_format(args: argparse.Namespace,
results: dict[str, Any],
file_name: str) -> None:
metrics = [
"median_ttft_ms", "mean_ttft_ms", "std_ttft_ms", "p99_ttft_ms",
"mean_tpot_ms", "median_tpot_ms", "std_tpot_ms", "p99_tpot_ms",
"median_itl_ms", "mean_itl_ms", "std_itl_ms", "p99_itl_ms"
]
# This raw data might be useful, but it is rather large. It can be added
# later if needed
ignored_metrics = ["ttfts", "itls", "generated_texts", "errors"]
pt_records = convert_to_pytorch_benchmark_format(
args=args,
metrics={k: [results[k]]
for k in metrics},
extra_info={
k: results[k]
for k in results if k not in metrics and k not in ignored_metrics
})
if pt_records:
# Don't use json suffix here as we don't want CI to pick it up
pt_file = f"{os.path.splitext(file_name)[0]}.pytorch.json"
write_to_json(pt_file, pt_records)
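# Minimal illustrative sketch (file name invented): the helper above swaps the
# ".json" suffix for ".pytorch.json" so that CI does not pick the file up as a
# regular result file, e.g.
#     os.path.splitext("results/serving-bench.json")[0] + ".pytorch.json"
#     -> "results/serving-bench.pytorch.json"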
def main(args: argparse.Namespace):
print(args)
random.seed(args.seed)
@ -797,81 +560,77 @@ def main(args: argparse.Namespace):
tokenizer_mode=tokenizer_mode,
trust_remote_code=args.trust_remote_code)
if args.dataset is not None:
warnings.warn(
"The '--dataset' argument will be deprecated in the next "
"release. Please use '--dataset-name' and "
"'--dataset-path' in the future runs.",
stacklevel=2)
input_requests = sample_sharegpt_requests(
dataset_path=args.dataset,
num_requests=args.num_prompts,
tokenizer=tokenizer,
fixed_output_len=args.sharegpt_output_len,
)
if args.dataset_name is None:
raise ValueError(
"Please specify '--dataset-name' and the corresponding "
"'--dataset-path' if required.")
elif args.dataset_name == "sharegpt":
input_requests = sample_sharegpt_requests(
dataset_path=args.dataset_path,
num_requests=args.num_prompts,
tokenizer=tokenizer,
fixed_output_len=args.sharegpt_output_len,
)
elif args.dataset_name == "sonnet":
# Do not format the prompt, pass to message directly
if args.dataset_name == "sonnet":
dataset = SonnetDataset(dataset_path=args.dataset_path)
# For the "sonnet" dataset, formatting depends on the backend.
if args.backend == "openai-chat":
input_requests = sample_sonnet_requests(
dataset_path=args.dataset_path,
num_requests=args.num_prompts,
input_len=args.sonnet_input_len,
output_len=args.sonnet_output_len,
prefix_len=args.sonnet_prefix_len,
tokenizer=tokenizer,
)
input_requests = [(prompt, prompt_len, output_len, None)
for prompt, prompt_formatted, prompt_len,
output_len, _ in input_requests]
input_requests = dataset.sample(num_requests=args.num_prompts,
input_len=args.sonnet_input_len,
output_len=args.sonnet_output_len,
prefix_len=args.sonnet_prefix_len,
tokenizer=tokenizer,
return_prompt_formatted=False)
else:
assert (
tokenizer.chat_template or tokenizer.default_chat_template
), "Tokenizer/model must have chat template for sonnet dataset."
input_requests = sample_sonnet_requests(
dataset_path=args.dataset_path,
num_requests=args.num_prompts,
input_len=args.sonnet_input_len,
output_len=args.sonnet_output_len,
prefix_len=args.sonnet_prefix_len,
tokenizer=tokenizer,
)
input_requests = [(prompt_formatted, prompt_len, output_len, None)
for prompt, prompt_formatted, prompt_len,
output_len, _ in input_requests]
assert tokenizer.chat_template or tokenizer.default_chat_template, (
"Tokenizer/model must have chat template for sonnet dataset.")
input_requests = dataset.sample(num_requests=args.num_prompts,
input_len=args.sonnet_input_len,
output_len=args.sonnet_output_len,
prefix_len=args.sonnet_prefix_len,
tokenizer=tokenizer,
return_prompt_formatted=True)
elif args.dataset_name == "hf":
input_requests = sample_hf_requests(
# Choose between VisionArenaDataset
# and HuggingFaceDataset based on provided parameters.
dataset_class = (VisionArenaDataset if args.dataset_path
== VisionArenaDataset.VISION_ARENA_DATASET_PATH
and args.hf_subset is None else HuggingFaceDataset)
input_requests = dataset_class(
dataset_path=args.dataset_path,
dataset_subset=args.hf_subset,
dataset_split=args.hf_split,
).sample(
num_requests=args.num_prompts,
tokenizer=tokenizer,
random_seed=args.seed,
fixed_output_len=args.hf_output_len,
)
elif args.dataset_name == "random":
input_requests = sample_random_requests(
prefix_len=args.random_prefix_len,
input_len=args.random_input_len,
output_len=args.random_output_len,
num_prompts=args.num_prompts,
range_ratio=args.random_range_ratio,
tokenizer=tokenizer,
output_len=args.hf_output_len,
)
else:
raise ValueError(f"Unknown dataset: {args.dataset_name}")
# For datasets that follow a similar structure, use a mapping.
dataset_mapping = {
"sharegpt":
lambda: ShareGPTDataset(random_seed=args.seed,
dataset_path=args.dataset_path).sample(
tokenizer=tokenizer,
num_requests=args.num_prompts,
output_len=args.sharegpt_output_len,
),
"burstgpt":
lambda: BurstGPTDataset(random_seed=args.seed,
dataset_path=args.dataset_path).
sample(tokenizer=tokenizer, num_requests=args.num_prompts),
"random":
lambda: RandomDataset(dataset_path=args.dataset_path).sample(
tokenizer=tokenizer,
num_requests=args.num_prompts,
prefix_len=args.random_prefix_len,
input_len=args.random_input_len,
output_len=args.random_output_len,
range_ratio=args.random_range_ratio,
)
}
try:
input_requests = dataset_mapping[args.dataset_name]()
except KeyError as err:
raise ValueError(f"Unknown dataset: {args.dataset_name}") from err
goodput_config_dict = check_goodput_args(args)
# Avoid GC processing "static" data - reduce pause times.
@ -888,7 +647,6 @@ def main(args: argparse.Namespace):
tokenizer=tokenizer,
input_requests=input_requests,
logprobs=args.logprobs,
best_of=args.best_of,
request_rate=args.request_rate,
burstiness=args.burstiness,
disable_tqdm=args.disable_tqdm,
@ -900,11 +658,12 @@ def main(args: argparse.Namespace):
ignore_eos=args.ignore_eos,
goodput_config_dict=goodput_config_dict,
max_concurrency=args.max_concurrency,
lora_modules=args.lora_modules,
))
# Save config and results to json
if args.save_result:
result_json: Dict[str, Any] = {}
result_json: dict[str, Any] = {}
# Setup
current_dt = datetime.now().strftime("%Y%m%d-%H%M%S")
@ -912,7 +671,6 @@ def main(args: argparse.Namespace):
result_json["backend"] = backend
result_json["model_id"] = model_id
result_json["tokenizer_id"] = tokenizer_id
result_json["best_of"] = args.best_of
result_json["num_prompts"] = args.num_prompts
# Metadata
@ -926,6 +684,15 @@ def main(args: argparse.Namespace):
"Invalid metadata format. Please use KEY=VALUE format."
)
if not args.save_detailed:
# Remove fields with too many data points
for field in [
"input_lens", "output_lens", "ttfts", "itls",
"generated_texts", "errors"
]:
if field in result_json:
del result_json[field]
# Traffic
result_json["request_rate"] = (args.request_rate if args.request_rate
< float("inf") else "inf")
@ -946,6 +713,7 @@ def main(args: argparse.Namespace):
file_name = os.path.join(args.result_dir, file_name)
with open(file_name, "w", encoding='utf-8') as outfile:
json.dump(result_json, outfile)
save_to_pytorch_benchmark_format(args, result_json, file_name)
if __name__ == "__main__":
@ -963,7 +731,8 @@ if __name__ == "__main__":
default=None,
help="Server or API base url if not using http host and port.",
)
parser.add_argument("--host", type=str, default="localhost")
# Use 127.0.0.1 here instead of localhost to force the use of ipv4
parser.add_argument("--host", type=str, default="127.0.0.1")
parser.add_argument("--port", type=int, default=8000)
parser.add_argument(
"--endpoint",
@ -971,18 +740,11 @@ if __name__ == "__main__":
default="/v1/completions",
help="API endpoint.",
)
parser.add_argument(
"--dataset",
type=str,
default=None,
help="Path to the ShareGPT dataset, will be deprecated in the "
"next release.",
)
parser.add_argument(
"--dataset-name",
type=str,
default="sharegpt",
choices=["sharegpt", "sonnet", "random", "hf"],
choices=["sharegpt", "burstgpt", "sonnet", "random", "hf"],
help="Name of the dataset to benchmark on.",
)
parser.add_argument("--dataset-path",
@ -1015,13 +777,6 @@ if __name__ == "__main__":
help=
"Name or path of the tokenizer, if not using the default tokenizer.", # noqa: E501
)
parser.add_argument(
"--best-of",
type=int,
default=1,
help="Generates `best_of` sequences per prompt and "
"returns the best one.",
)
parser.add_argument("--use-beam-search", action="store_true")
parser.add_argument(
"--num-prompts",
@ -1082,6 +837,12 @@ if __name__ == "__main__":
action="store_true",
help="Specify to save benchmark results to a json file",
)
parser.add_argument(
"--save-detailed",
action="store_true",
help="When saving the results, whether to include per request "
"information such as response, error, ttfs, tpots, etc.",
)
parser.add_argument(
"--metadata",
metavar="KEY=VALUE",
@ -1224,11 +985,12 @@ if __name__ == "__main__":
'--tokenizer-mode',
type=str,
default="auto",
choices=['auto', 'slow', 'mistral'],
choices=['auto', 'slow', 'mistral', 'custom'],
help='The tokenizer mode.\n\n* "auto" will use the '
'fast tokenizer if available.\n* "slow" will '
'always use the slow tokenizer. \n* '
'"mistral" will always use the `mistral_common` tokenizer.')
'"mistral" will always use the `mistral_common` tokenizer. \n*'
'"custom" will use --tokenizer to select the preregistered tokenizer.')
parser.add_argument("--served-model-name",
type=str,
@ -1237,5 +999,13 @@ if __name__ == "__main__":
"If not specified, the model name will be the "
"same as the ``--model`` argument. ")
parser.add_argument("--lora-modules",
nargs='+',
default=None,
help="A subset of LoRA module names passed in when "
"launching the server. For each request, the "
"script chooses a LoRA module at random.")
args = parser.parse_args()
main(args)

View File

@ -1,5 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
r"""Benchmark online serving throughput with guided decoding.
r"""Benchmark online serving throughput with structured outputs.
On the server side, run one of the following commands:
(vLLM OpenAI API server)
@ -9,12 +9,12 @@ On the server side, run one of the following commands:
./launch_tgi_server.sh <your_model> <max_batch_total_tokens>
On the client side, run:
python benchmarks/benchmark_serving.py \
python benchmarks/benchmark_serving_structured_output.py \
--backend <backend> \
--model <your_model> \
--dataset json \
--guided-decoding-ratio 1.0 \
--guided-decoding-backend xgrammar \
--structured-output-ratio 1.0 \
--structured-output-backend xgrammar \
--request-rate 10 \
--num-prompts 1000
@ -24,14 +24,17 @@ On the client side, run:
"""
import argparse
import asyncio
import copy
import dataclasses
import json
import os
import random
import time
import uuid
import warnings
from collections.abc import AsyncGenerator
from dataclasses import dataclass
from typing import AsyncGenerator, List, Optional, Tuple
from typing import Optional
import datasets
import numpy as np
@ -51,6 +54,9 @@ try:
except ImportError:
from argparse import ArgumentParser as FlexibleArgumentParser
from vllm.v1.structured_output.utils import (
has_xgrammar_unsupported_json_features)
MILLISECONDS_TO_SECONDS_CONVERSION = 1000
@ -66,22 +72,22 @@ class BenchmarkMetrics:
mean_ttft_ms: float
median_ttft_ms: float
std_ttft_ms: float
percentiles_ttft_ms: List[Tuple[float, float]]
percentiles_ttft_ms: list[tuple[float, float]]
mean_tpot_ms: float
median_tpot_ms: float
std_tpot_ms: float
percentiles_tpot_ms: List[Tuple[float, float]]
percentiles_tpot_ms: list[tuple[float, float]]
mean_itl_ms: float
median_itl_ms: float
std_itl_ms: float
percentiles_itl_ms: List[Tuple[float, float]]
percentiles_itl_ms: list[tuple[float, float]]
# E2EL stands for end-to-end latency per request.
# It is the time taken on the client side from sending
# a request to receiving a complete response.
mean_e2el_ms: float
median_e2el_ms: float
std_e2el_ms: float
percentiles_e2el_ms: List[Tuple[float, float]]
percentiles_e2el_ms: list[tuple[float, float]]
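# Illustrative note (values invented): each percentiles_* field stores
# (percentile, value-in-ms) pairs derived from the per-request samples, e.g.
#     percentiles_e2el_ms = [(50.0, 812.3), (99.0, 2430.1)]
# would mean a median end-to-end latency of roughly 812 ms and a p99 of
# roughly 2430 ms.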
@dataclasses.dataclass
@ -104,25 +110,44 @@ class SampleRequest:
def sample_requests(tokenizer: PreTrainedTokenizerBase,
args: argparse.Namespace) -> List[SampleRequest]:
if args.dataset == 'json':
args: argparse.Namespace) -> list[SampleRequest]:
if args.dataset == 'json' or args.dataset == 'json-unique':
if args.json_schema_path is None:
dir_path = os.path.dirname(os.path.realpath(__file__))
args.json_schema_path = os.path.join(dir_path,
"structured_schemas",
"structured_schema_1.json")
json_schemas = []
with open(args.json_schema_path) as f:
schema = json.load(f)
prompt = f"Generate an example of a user profile given the following schema: {json.dumps(schema)}" # noqa: E501
input_len = len(tokenizer(prompt).input_ids)
print(f"Input length of the prompt: {input_len} tokens")
if args.dataset == 'json-unique':
json_schemas = [
copy.deepcopy(schema) for _ in range(args.num_prompts)
]
for i in range(len(json_schemas)):
json_schemas[i]["properties"][
f"__optional_field_{uuid.uuid4()}"] = {
"type":
"string",
"description":
"An unique optional field to avoid cached schemas"
}
def gen_prompt(index: int):
schema = json_schemas[index % len(json_schemas)]
return f"Generate an example of a user profile given the following schema: {json.dumps(schema)}" # noqa: E501
def get_schema(index: int):
return json_schemas[index % len(json_schemas)]
requests = [
SampleRequest(prompt=prompt,
prompt_len=input_len,
SampleRequest(prompt=gen_prompt(i),
prompt_len=len(tokenizer(gen_prompt(i)).input_ids),
expected_output_len=args.output_len,
schema=schema,
schema=get_schema(i),
structure_type=args.structure_type)
for _ in range(args.num_prompts)
for i in range(args.num_prompts)
]
elif args.dataset == "grammar":
@ -187,10 +212,20 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase,
]
elif args.dataset == "xgrammar_bench":
requests: List[SampleRequest] = []
requests: list[SampleRequest] = []
dataset = datasets.load_dataset("NousResearch/json-mode-eval",
split="train")
print(f"dataset has {len(dataset)} entries")
full_dataset_len = len(dataset)
def _filter_func(item):
import json
schema = json.loads(item["schema"])
return not has_xgrammar_unsupported_json_features(schema)
dataset = dataset.filter(_filter_func)
num_filtered_out = full_dataset_len - len(dataset)
print(f"dataset has {len(dataset)} entries after filtering "
f"out {num_filtered_out} entries with unsupported features")
len_dataset = len(dataset)
for data_point_idx in range(args.num_prompts):
idx = data_point_idx
@ -214,26 +249,26 @@ def sample_requests(tokenizer: PreTrainedTokenizerBase,
async def get_request(
input_requests: List[SampleRequest],
input_requests: list[SampleRequest],
request_rate: float,
burstiness: float = 1.0,
) -> AsyncGenerator[Tuple[int, SampleRequest], None]:
) -> AsyncGenerator[tuple[int, SampleRequest], None]:
"""
Asynchronously generates requests at a specified rate
Asynchronously generates requests at a specified rate
with OPTIONAL burstiness.
Args:
input_requests:
input_requests:
A list of input requests, each represented as a tuple.
request_rate:
request_rate:
The rate at which requests are generated (requests/s).
burstiness (optional):
The burstiness factor of the request generation.
burstiness (optional):
The burstiness factor of the request generation.
Only takes effect when request_rate is not inf.
Default value is 1, which follows a Poisson process.
Otherwise, the request intervals follow a gamma distribution.
A lower burstiness value (0 < burstiness < 1) results
in more bursty requests, while a higher burstiness value
A lower burstiness value (0 < burstiness < 1) results
in more bursty requests, while a higher burstiness value
(burstiness > 1) results in a more uniform arrival of requests.
"""
input_requests = iter(input_requests)
@ -258,22 +293,23 @@ async def get_request(
def calculate_metrics(
input_requests: List[Tuple[str, int, int]],
outputs: List[RequestFuncOutput],
input_requests: list[tuple[str, int, int]],
outputs: list[RequestFuncOutput],
dur_s: float,
tokenizer: PreTrainedTokenizerBase,
selected_percentile_metrics: List[str],
selected_percentiles: List[float],
) -> Tuple[BenchmarkMetrics, List[int]]:
actual_output_lens: List[int] = []
selected_percentile_metrics: list[str],
selected_percentiles: list[float],
goodput_config_dict: Optional[dict[str, float]] = None,
) -> tuple[BenchmarkMetrics, list[int]]:
actual_output_lens: list[int] = []
total_input = 0
completed = 0
good_completed = 0
itls: List[float] = []
tpots: List[float] = []
all_tpots: List[float] = []
ttfts: List[float] = []
e2els: List[float] = []
itls: list[float] = []
tpots: list[float] = []
all_tpots: list[float] = []
ttfts: list[float] = []
e2els: list[float] = []
for i in range(len(outputs)):
if outputs[i].success:
# We use the tokenizer to count the number of output tokens for all
@ -287,10 +323,10 @@ def calculate_metrics(
total_input += input_requests[i].prompt_len
tpot = 0
if output_len > 1:
tpot = (outputs[i].latency - outputs[i].ttft) / (output_len -
1)
latency_minus_ttft = outputs[i].latency - outputs[i].ttft
tpot = latency_minus_ttft / (output_len - 1)
tpots.append(tpot)
outputs[i].tpot = sum(tpots) / len(tpots) if len(tpots) else 0
outputs[i].tpot = tpot
# Note: if output_len <= 1, we regard tpot as 0 for goodput
all_tpots.append(tpot)
itls += outputs[i].itl
@ -300,6 +336,28 @@ def calculate_metrics(
else:
actual_output_lens.append(0)
if goodput_config_dict:
valid_metrics = []
slo_values = []
if "ttft" in goodput_config_dict:
valid_metrics.append(ttfts)
slo_values.append(goodput_config_dict["ttft"] /
MILLISECONDS_TO_SECONDS_CONVERSION)
if "tpot" in goodput_config_dict:
valid_metrics.append(all_tpots)
slo_values.append(goodput_config_dict["tpot"] /
MILLISECONDS_TO_SECONDS_CONVERSION)
if "e2el" in goodput_config_dict:
valid_metrics.append(e2els)
slo_values.append(goodput_config_dict["e2el"] /
MILLISECONDS_TO_SECONDS_CONVERSION)
for req_metric in zip(*valid_metrics):
is_good_req = all([s >= r for s, r in zip(slo_values, req_metric)])
if is_good_req:
good_completed += 1
if completed == 0:
warnings.warn(
"All requests failed. This is likely due to a misconfiguration "
@ -345,17 +403,18 @@ async def benchmark(
base_url: str,
model_id: str,
tokenizer: PreTrainedTokenizerBase,
input_requests: List[SampleRequest],
input_requests: list[SampleRequest],
request_rate: float,
burstiness: float,
disable_tqdm: bool,
profile: bool,
selected_percentile_metrics: List[str],
selected_percentiles: List[str],
selected_percentile_metrics: list[str],
selected_percentiles: list[str],
ignore_eos: bool,
max_concurrency: Optional[int],
guided_decoding_ratio: float,
guided_decoding_backend: str,
structured_output_ratio: float,
structured_output_backend: str,
goodput_config_dict: Optional[dict[str, float]] = None,
):
if backend in ASYNC_REQUEST_FUNCS:
request_func = ASYNC_REQUEST_FUNCS[backend]
@ -366,16 +425,18 @@ async def benchmark(
extra_body = {}
# Add the schema to the extra_body
extra_body[request.structure_type] = request.schema
# Add the specific guided_decoding_backend
extra_body["guided_decoding_backend"] = guided_decoding_backend
# Add the specific structured_output_backend
extra_body["guided_decoding_backend"] = structured_output_backend
return extra_body
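# Illustrative result for a hypothetical request: with
# request.structure_type == "guided_json" and an xgrammar backend, the helper
# above returns an extra_body like
#     {"guided_json": <request.schema>, "guided_decoding_backend": "xgrammar"}
# Note that the backend is still passed under the "guided_decoding_backend"
# key even though the CLI flag is now --structured-output-backend.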
print("Starting initial single prompt test run...")
guided_decoding_req_idx = random.sample(
structured_output_req_idx = random.sample(
range(len(input_requests)),
int(len(input_requests) * guided_decoding_ratio))
int(len(input_requests) * structured_output_ratio))
test_request = input_requests[0]
test_req_extra_body = (prepare_extra_body(test_request)
if 0 in structured_output_req_idx else None)
test_input = RequestFuncInput(
model=model_id,
prompt=test_request.prompt,
@ -383,7 +444,7 @@ async def benchmark(
prompt_len=test_request.prompt_len,
output_len=test_request.expected_output_len,
ignore_eos=ignore_eos,
extra_body=prepare_extra_body(test_request),
extra_body=test_req_extra_body,
)
test_output = await request_func(request_func_input=test_input)
if not test_output.success:
@ -402,7 +463,7 @@ async def benchmark(
prompt_len=test_request.prompt_len,
output_len=test_request.expected_output_len,
ignore_eos=ignore_eos,
extra_body=prepare_extra_body(test_request),
extra_body=test_req_extra_body,
)
profile_output = await request_func(request_func_input=profile_input)
if profile_output.success:
@ -435,12 +496,12 @@ async def benchmark(
pbar=pbar)
benchmark_start_time = time.perf_counter()
tasks: List[asyncio.Task] = []
expected: List[str] = []
tasks: list[asyncio.Task] = []
expected: list[str] = []
async for i, request in get_request(input_requests, request_rate,
burstiness):
extra_body = prepare_extra_body(
request) if i in guided_decoding_req_idx else None
request) if i in structured_output_req_idx else None
request_func_input = RequestFuncInput(
model=model_id,
prompt=request.prompt,
@ -455,7 +516,7 @@ async def benchmark(
asyncio.create_task(
limited_request_func(request_func_input=request_func_input,
pbar=pbar)))
outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks)
outputs: list[RequestFuncOutput] = await asyncio.gather(*tasks)
if profile:
print("Stopping profiler...")
@ -483,6 +544,7 @@ async def benchmark(
tokenizer=tokenizer,
selected_percentile_metrics=selected_percentile_metrics,
selected_percentiles=selected_percentiles,
goodput_config_dict=goodput_config_dict,
)
print("{s:{c}^{n}}".format(s=' Serving Benchmark Result ', n=50, c='='))
@ -494,6 +556,9 @@ async def benchmark(
metrics.total_output))
print("{:<40} {:<10.2f}".format("Request throughput (req/s):",
metrics.request_throughput))
if goodput_config_dict:
print("{:<40} {:<10.2f}".format("Request goodput (req/s):",
metrics.request_goodput))
print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):",
metrics.output_throughput))
print("{:<40} {:<10.2f}".format("Total Token throughput (tok/s):",
@ -617,6 +682,40 @@ def evaluate(ret, args):
100) if len(not_none_scores) > 0 else None
def parse_goodput(slo_pairs):
goodput_config_dict = {}
try:
for slo_pair in slo_pairs:
slo_name, slo_val = slo_pair.split(":")
goodput_config_dict[slo_name] = float(slo_val)
except ValueError as err:
raise argparse.ArgumentTypeError(
"Invalid format found for service level objectives. "
"Specify service level objectives for goodput as \"KEY:VALUE\" "
"pairs, where the key is a metric name, and the value is a "
"number in milliseconds.") from err
return goodput_config_dict
def check_goodput_args(args):
goodput_config_dict = {}
VALID_NAMES = ["ttft", "tpot", "e2el"]
if args.goodput:
goodput_config_dict = parse_goodput(args.goodput)
for slo_name, slo_val in goodput_config_dict.items():
if slo_name not in VALID_NAMES:
raise ValueError(
f"Invalid metric name found, {slo_name}: {slo_val}. "
"The service level objective name should be one of "
f"{str(VALID_NAMES)}. ")
if slo_val < 0:
raise ValueError(
f"Invalid value found, {slo_name}: {slo_val}. "
"The service level objective value should be "
"non-negative.")
return goodput_config_dict
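# Hedged usage sketch of the helpers above (SLO values invented): --goodput
# takes space-separated "KEY:VALUE" pairs in milliseconds, so
#     parse_goodput(["ttft:300", "tpot:50", "e2el:2000"])
# returns {"ttft": 300.0, "tpot": 50.0, "e2el": 2000.0}. calculate_metrics then
# converts each SLO to seconds via MILLISECONDS_TO_SECONDS_CONVERSION and
# counts a request as "good" only when every configured metric meets its SLO.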
def main(args: argparse.Namespace):
print(args)
random.seed(args.seed)
@ -645,10 +744,10 @@ def main(args: argparse.Namespace):
else:
args.structure_type = 'guided_json'
if args.no_guided_decoding:
args.guided_decoding_ratio = 0
if args.no_structured_output:
args.structured_output_ratio = 0
if args.save_results:
result_file_name = f'{args.guided_decoding_ratio}guided'
result_file_name = f'{args.structured_output_ratio}guided'
result_file_name += f"_{backend}"
result_file_name += f"_{args.request_rate}qps"
result_file_name += f"_{args.model.split('/')[-1]}"
@ -661,6 +760,8 @@ def main(args: argparse.Namespace):
input_requests = sample_requests(tokenizer, args)
goodput_config_dict = check_goodput_args(args)
benchmark_result, ret = asyncio.run(
benchmark(
backend=backend,
@ -679,8 +780,9 @@ def main(args: argparse.Namespace):
],
ignore_eos=args.ignore_eos,
max_concurrency=args.max_concurrency,
guided_decoding_ratio=args.guided_decoding_ratio,
guided_decoding_backend=args.guided_decoding_backend,
structured_output_ratio=args.structured_output_ratio,
structured_output_backend=args.structured_output_backend,
goodput_config_dict=goodput_config_dict,
))
# Save config and results to json
@ -731,7 +833,8 @@ if __name__ == "__main__":
default=None,
help="Server or API base url if not using http host and port.",
)
parser.add_argument("--host", type=str, default="localhost")
# Use 127.0.0.1 here instead of localhost to force the use of ipv4
parser.add_argument("--host", type=str, default="127.0.0.1")
parser.add_argument("--port", type=int, default=8000)
parser.add_argument(
"--endpoint",
@ -739,10 +842,12 @@ if __name__ == "__main__":
default="/v1/completions",
help="API endpoint.",
)
parser.add_argument(
"--dataset",
default='json',
choices=['json', 'grammar', 'regex', 'choice', 'xgrammar_bench'])
parser.add_argument("--dataset",
default='json',
choices=[
'json', 'json-unique', 'grammar', 'regex',
'choice', 'xgrammar_bench'
])
parser.add_argument("--json_schema_path",
type=str,
default=None,
@ -864,19 +969,31 @@ if __name__ == "__main__":
"Default value is \"99\". "
"Use \"--percentile-metrics\" to select metrics.",
)
parser.add_argument("--no-guided-decoding",
parser.add_argument(
"--goodput",
nargs="+",
required=False,
help="Specify service level objectives for goodput as \"KEY:VALUE\" "
"pairs, where the key is a metric name, and the value is in "
"milliseconds. Multiple \"KEY:VALUE\" pairs can be provided, "
"separated by spaces. Allowed request level metric names are "
"\"ttft\", \"tpot\", \"e2el\". For more context on the definition of "
"goodput, refer to DistServe paper: https://arxiv.org/pdf/2401.09670 "
"and the blog: https://hao-ai-lab.github.io/blogs/distserve")
parser.add_argument("--no-structured-output",
action='store_true',
default=False,
help="Whether to disable JSON decoding or not.")
parser.add_argument("--guided-decoding-ratio",
parser.add_argument("--structured-output-ratio",
type=float,
default=1.0,
help="Ratio of Guided Decoding requests")
parser.add_argument("--guided-decoding-backend",
help="Ratio of Structured Outputs requests")
parser.add_argument("--structured-output-backend",
type=str,
choices=["outlines", "lm-format-enforcer", "xgrammar"],
default="xgrammar",
help="Backend to use for guided decoding")
help="Backend to use for structured outputs")
args = parser.parse_args()
main(args)

View File

@ -3,14 +3,18 @@
import argparse
import dataclasses
import json
import os
import random
import time
from functools import cache
from typing import Dict, List, Optional, Tuple
import warnings
from typing import Any, Optional, Union
import torch
import uvloop
from PIL import Image
from benchmark_dataset import (BurstGPTDataset, HuggingFaceDataset,
RandomDataset, SampleRequest, ShareGPTDataset,
SonnetDataset, VisionArenaDataset)
from benchmark_utils import convert_to_pytorch_benchmark_format, write_to_json
from tqdm import tqdm
from transformers import (AutoModelForCausalLM, AutoTokenizer,
PreTrainedTokenizerBase)
@ -18,163 +22,35 @@ from transformers import (AutoModelForCausalLM, AutoTokenizer,
from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
from vllm.entrypoints.openai.api_server import (
build_async_engine_client_from_engine_args)
from vllm.inputs import TextPrompt
from vllm.inputs import TextPrompt, TokensPrompt
from vllm.lora.request import LoRARequest
from vllm.lora.utils import get_adapter_absolute_path
from vllm.multimodal import MultiModalDataDict
from vllm.outputs import RequestOutput
from vllm.sampling_params import BeamSearchParams
from vllm.transformers_utils.tokenizer import AnyTokenizer, get_lora_tokenizer
from vllm.utils import FlexibleArgumentParser, merge_async_iterators
@dataclasses.dataclass
class SampleRequest:
"""A class representing a single inference request for benchmarking.
Attributes:
prompt: The input text prompt for the model.
prompt_len: The length of the prompt in tokens.
expected_output_len: The expected length of the output in tokens.
multi_modal_data: Optional dictionary containing multi-modal data (e.g.
images).
lora_request: Optional LoRARequest specifying the LoRA to use.
"""
prompt: str
prompt_len: int
expected_output_len: int
multi_modal_data: Optional[MultiModalDataDict] = None
lora_request: Optional[LoRARequest] = None
def _get_prompt_for_image_model(question: str, *, model: str) -> str:
"""Prepend and append special tokens around the question to form a prompt.
Args:
question: The input question text to wrap with special tokens
model: The name of the model being used, to determine which special
tokens to add
Returns:
The formatted prompt string with appropriate special tokens for the
model
Raises:
ValueError: If an unsupported model name is provided
"""
model = model.lower()
if "pixtral" in model:
return f"<s>[INST]{question}\n[IMG][/INST]"
raise ValueError(f"Unsupported model {model}")
@cache
def lora_path_on_disk(lora_path: str) -> str:
return get_adapter_absolute_path(lora_path)
lora_tokenizer_cache: Dict[int, AnyTokenizer] = {}
def get_random_lora_request(
args: argparse.Namespace
) -> Tuple[LoRARequest, Optional[AnyTokenizer]]:
global lora_tokenizer_cache
lora_id = random.randint(1, args.max_loras)
lora_request = LoRARequest(lora_name=str(lora_id),
lora_int_id=lora_id,
lora_path=lora_path_on_disk(args.lora_path))
if lora_id not in lora_tokenizer_cache:
lora_tokenizer_cache[lora_id] = get_lora_tokenizer(lora_request)
return lora_request, lora_tokenizer_cache[lora_id]
def sample_requests(tokenizer: PreTrainedTokenizerBase,
args: argparse.Namespace) -> List[SampleRequest]:
dataset_path: str = args.dataset
num_requests: int = args.num_prompts
fixed_output_len: Optional[int] = args.output_len
model: str = args.model
if fixed_output_len is not None and fixed_output_len < 4:
raise ValueError("output_len too small")
# Load the dataset.
with open(dataset_path) as f:
dataset = json.load(f)
# Filter out the conversations with less than 2 turns.
dataset = [data for data in dataset if len(data["conversations"]) >= 2]
# Shuffle the dataset.
random.shuffle(dataset)
# Filter out sequences that are too long or too short
filtered_dataset: List[SampleRequest] = []
for data in tqdm(dataset,
total=len(filtered_dataset),
desc="sampling requests"):
if len(filtered_dataset) == num_requests:
break
# Only keep the first two turns of each conversation.
prompt = data["conversations"][0]["value"]
completion = data["conversations"][1]["value"]
multi_modal_data: Optional[MultiModalDataDict] = None
if "image" in data:
multi_modal_data = multi_modal_data or {}
image_path = data["image"]
# TODO(vllm-project/vllm/issues/9778): Support multiple images.
assert isinstance(image_path,
str), "Only support single image input"
try:
multi_modal_data["image"] = Image.open(image_path).convert(
"RGB")
except FileNotFoundError:
# Ignore datapoint where asset is missing
continue
prompt = _get_prompt_for_image_model(question=prompt, model=model)
request_tokenizer = tokenizer
lora_request: Optional[LoRARequest] = None
if args.enable_lora:
lora_request, lora_tokenizer = get_random_lora_request(args)
if lora_tokenizer:
request_tokenizer = lora_tokenizer
# Tokenize the prompts and completions.
prompt_token_ids = request_tokenizer(prompt).input_ids
completion_token_ids = request_tokenizer(completion).input_ids
prompt_len = len(prompt_token_ids)
output_len = len(completion_token_ids
) if fixed_output_len is None else fixed_output_len
if prompt_len < 4 or output_len < 4:
# Prune too short sequences.
continue
if prompt_len > 1024 or prompt_len + output_len > 2048:
# Prune too long sequences.
continue
filtered_dataset.append(
SampleRequest(prompt=prompt,
prompt_len=prompt_len,
expected_output_len=output_len,
multi_modal_data=multi_modal_data,
lora_request=lora_request))
return filtered_dataset
def run_vllm(
requests: List[SampleRequest],
requests: list[SampleRequest],
n: int,
engine_args: EngineArgs,
) -> float:
disable_detokenize: bool = False,
) -> tuple[float, Optional[list[RequestOutput]]]:
from vllm import LLM, SamplingParams
llm = LLM(**dataclasses.asdict(engine_args))
assert all(
llm.llm_engine.model_config.max_model_len >= (
request.prompt_len + request.expected_output_len)
for request in requests), (
"Please ensure that max_model_len is greater than the sum of"
" prompt_len and expected_output_len for all requests.")
# Add the requests to the engine.
prompts: List[TextPrompt] = []
sampling_params: List[SamplingParams] = []
prompts: list[Union[TextPrompt, TokensPrompt]] = []
sampling_params: list[SamplingParams] = []
for request in requests:
prompts.append(
TokensPrompt(prompt_token_ids=request.prompt["prompt_token_ids"],
multi_modal_data=request.multi_modal_data)
if "prompt_token_ids" in request.prompt else \
TextPrompt(prompt=request.prompt,
multi_modal_data=request.multi_modal_data))
sampling_params.append(
@ -184,19 +60,21 @@ def run_vllm(
top_p=1.0,
ignore_eos=True,
max_tokens=request.expected_output_len,
detokenize=not disable_detokenize,
))
lora_requests: Optional[List[LoRARequest]] = None
lora_requests: Optional[list[LoRARequest]] = None
if engine_args.enable_lora:
lora_requests = [request.lora_request for request in requests]
use_beam_search = False
outputs = None
if not use_beam_search:
start = time.perf_counter()
llm.generate(prompts,
sampling_params,
lora_request=lora_requests,
use_tqdm=True)
outputs = llm.generate(prompts,
sampling_params,
lora_request=lora_requests,
use_tqdm=True)
end = time.perf_counter()
else:
assert lora_requests is None, "BeamSearch API does not support LoRA"
@ -214,26 +92,75 @@ def run_vllm(
ignore_eos=True,
))
end = time.perf_counter()
return end - start
return end - start, outputs
def run_vllm_chat(
requests: list[SampleRequest],
n: int,
engine_args: EngineArgs,
disable_detokenize: bool = False) -> tuple[float, list[RequestOutput]]:
"""
Run vLLM chat benchmark. This function is recommended ONLY for benchmarking
multimodal models as it properly handles multimodal inputs and chat
formatting. For non-multimodal models, use run_vllm() instead.
"""
from vllm import LLM, SamplingParams
llm = LLM(**dataclasses.asdict(engine_args))
assert all(
llm.llm_engine.model_config.max_model_len >= (
request.prompt_len + request.expected_output_len)
for request in requests), (
"Please ensure that max_model_len is greater than the sum of "
"prompt_len and expected_output_len for all requests.")
prompts = []
sampling_params: list[SamplingParams] = []
for request in requests:
prompts.append(request.prompt)
sampling_params.append(
SamplingParams(
n=n,
temperature=1.0,
top_p=1.0,
ignore_eos=True,
max_tokens=request.expected_output_len,
detokenize=not disable_detokenize,
))
start = time.perf_counter()
outputs = llm.chat(prompts, sampling_params, use_tqdm=True)
end = time.perf_counter()
return end - start, outputs
async def run_vllm_async(
requests: List[SampleRequest],
requests: list[SampleRequest],
n: int,
engine_args: AsyncEngineArgs,
disable_frontend_multiprocessing: bool = False,
disable_detokenize: bool = False,
) -> float:
from vllm import SamplingParams
async with build_async_engine_client_from_engine_args(
engine_args, disable_frontend_multiprocessing) as llm:
assert all(
llm.model_config.max_model_len >= (request.prompt_len +
request.expected_output_len)
for request in requests), (
"Please ensure that max_model_len is greater than the sum of"
" prompt_len and expected_output_len for all requests.")
# Add the requests to the engine.
prompts: List[TextPrompt] = []
sampling_params: List[SamplingParams] = []
lora_requests: List[Optional[LoRARequest]] = []
prompts: list[Union[TextPrompt, TokensPrompt]] = []
sampling_params: list[SamplingParams] = []
lora_requests: list[Optional[LoRARequest]] = []
for request in requests:
prompts.append(
TokensPrompt(prompt_token_ids=request.prompt["prompt_token_ids"],
multi_modal_data=request.multi_modal_data)
if "prompt_token_ids" in request.prompt else \
TextPrompt(prompt=request.prompt,
multi_modal_data=request.multi_modal_data))
sampling_params.append(
@ -243,6 +170,7 @@ async def run_vllm_async(
top_p=1.0,
ignore_eos=True,
max_tokens=request.expected_output_len,
detokenize=not disable_detokenize,
))
lora_requests.append(request.lora_request)
@ -263,12 +191,13 @@ async def run_vllm_async(
def run_hf(
requests: List[SampleRequest],
requests: list[SampleRequest],
model: str,
tokenizer: PreTrainedTokenizerBase,
n: int,
max_batch_size: int,
trust_remote_code: bool,
disable_detokenize: bool = False,
) -> float:
llm = AutoModelForCausalLM.from_pretrained(
model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code)
@ -279,7 +208,7 @@ def run_hf(
pbar = tqdm(total=len(requests))
start = time.perf_counter()
batch: List[str] = []
batch: list[str] = []
max_prompt_len = 0
max_output_len = 0
for i in range(len(requests)):
@ -308,8 +237,9 @@ def run_hf(
use_cache=True,
max_new_tokens=max_output_len,
)
# Include the decoding time.
tokenizer.batch_decode(llm_outputs, skip_special_tokens=True)
if not disable_detokenize:
# Include the decoding time.
tokenizer.batch_decode(llm_outputs, skip_special_tokens=True)
pbar.update(len(batch))
# Clear the batch.
@ -321,7 +251,7 @@ def run_hf(
def run_mii(
requests: List[SampleRequest],
requests: list[SampleRequest],
model: str,
tensor_parallel_size: int,
output_len: int,
@ -338,58 +268,86 @@ def run_mii(
return end - start
def save_to_pytorch_benchmark_format(args: argparse.Namespace,
results: dict[str, Any]) -> None:
pt_records = convert_to_pytorch_benchmark_format(
args=args,
metrics={
"requests_per_second": [results["requests_per_second"]],
"tokens_per_second": [results["tokens_per_second"]],
},
extra_info={
k: results[k]
for k in ["elapsed_time", "num_requests", "total_num_tokens"]
})
if pt_records:
# Don't use json suffix here as we don't want CI to pick it up
pt_file = f"{os.path.splitext(args.output_json)[0]}.pytorch.json"
write_to_json(pt_file, pt_records)
def get_requests(args, tokenizer):
# Common parameters for all dataset types.
common_kwargs = {
"dataset_path": args.dataset_path,
"random_seed": args.seed,
}
sample_kwargs = {
"tokenizer": tokenizer,
"lora_path": args.lora_path,
"max_loras": args.max_loras,
"num_requests": args.num_prompts,
"input_len": args.input_len,
"output_len": args.output_len,
}
if args.dataset_path is None or args.dataset_name == "random":
sample_kwargs["range_ratio"] = args.random_range_ratio
sample_kwargs["prefix_len"] = args.prefix_len
dataset_cls = RandomDataset
elif args.dataset_name == "sharegpt":
dataset_cls = ShareGPTDataset
if args.backend == "vllm-chat":
sample_kwargs["enable_multimodal_chat"] = True
elif args.dataset_name == "sonnet":
assert tokenizer.chat_template or tokenizer.default_chat_template, (
"Tokenizer/model must have chat template for sonnet dataset.")
dataset_cls = SonnetDataset
sample_kwargs["prefix_len"] = args.prefix_len
sample_kwargs["return_prompt_formatted"] = True
elif args.dataset_name == "burstgpt":
dataset_cls = BurstGPTDataset
elif args.dataset_name == "hf":
if args.backend != "vllm-chat":
raise ValueError(
"hf datasets only are supported by vllm-chat backend")
# Choose between VisionArenaDataset and HuggingFaceDataset based on
# provided parameters.
dataset_cls = (VisionArenaDataset if args.dataset_path
== VisionArenaDataset.VISION_ARENA_DATASET_PATH
and args.hf_subset is None else HuggingFaceDataset)
common_kwargs['dataset_subset'] = args.hf_subset
common_kwargs['dataset_split'] = args.hf_split
sample_kwargs["enable_multimodal_chat"] = True
else:
raise ValueError(f"Unknown dataset name: {args.dataset_name}")
# Remove None values
sample_kwargs = {k: v for k, v in sample_kwargs.items() if v is not None}
return dataset_cls(**common_kwargs).sample(**sample_kwargs)
def main(args: argparse.Namespace):
if args.seed is None:
args.seed = 0
print(args)
random.seed(args.seed)
# Sample the requests.
tokenizer = AutoTokenizer.from_pretrained(
args.tokenizer, trust_remote_code=args.trust_remote_code)
if args.dataset is None:
vocab_size = tokenizer.vocab_size
requests = []
for _ in range(args.num_prompts):
request_tokenizer = tokenizer
lora_request: Optional[LoRARequest] = None
if args.enable_lora:
lora_request, lora_tokenizer = get_random_lora_request(args)
if lora_tokenizer:
request_tokenizer = lora_tokenizer
# Synthesize a prompt with the given input length.
candidate_ids = [
random.randint(0, vocab_size - 1)
for _ in range(args.input_len)
]
# As tokenizer may add additional tokens like BOS, we need to try
# different lengths to get the desired input length.
for _ in range(5): # Max attempts to correct
candidate_prompt = request_tokenizer.decode(candidate_ids)
tokenized_len = len(request_tokenizer.encode(candidate_prompt))
if tokenized_len == args.input_len:
break
# Adjust length based on difference
diff = args.input_len - tokenized_len
if diff > 0:
candidate_ids.extend([
random.randint(100, vocab_size - 100)
for _ in range(diff)
])
else:
candidate_ids = candidate_ids[:diff]
requests.append(
SampleRequest(prompt=candidate_prompt,
prompt_len=args.input_len,
expected_output_len=args.output_len,
lora_request=lora_request))
else:
requests = sample_requests(tokenizer, args)
requests = get_requests(args, tokenizer)
is_multi_modal = any(request.multi_modal_data is not None
for request in requests)
request_outputs: Optional[list[RequestOutput]] = None
if args.backend == "vllm":
if args.async_engine:
elapsed_time = uvloop.run(
@ -398,31 +356,59 @@ def main(args: argparse.Namespace):
args.n,
AsyncEngineArgs.from_cli_args(args),
args.disable_frontend_multiprocessing,
args.disable_detokenize,
))
else:
elapsed_time = run_vllm(requests, args.n,
EngineArgs.from_cli_args(args))
elapsed_time, request_outputs = run_vllm(
requests, args.n, EngineArgs.from_cli_args(args),
args.disable_detokenize)
elif args.backend == "hf":
assert args.tensor_parallel_size == 1
elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
args.hf_max_batch_size, args.trust_remote_code)
args.hf_max_batch_size, args.trust_remote_code,
args.disable_detokenize)
elif args.backend == "mii":
elapsed_time = run_mii(requests, args.model, args.tensor_parallel_size,
args.output_len)
elif args.backend == "vllm-chat":
elapsed_time, request_outputs = run_vllm_chat(
requests, args.n, EngineArgs.from_cli_args(args),
args.disable_detokenize)
else:
raise ValueError(f"Unknown backend: {args.backend}")
total_num_tokens = sum(request.prompt_len + request.expected_output_len
for request in requests)
total_output_tokens = sum(request.expected_output_len
for request in requests)
if is_multi_modal:
print("\033[91mWARNING\033[0m: Multi-modal request detected. The "
if request_outputs:
# Note: with the vllm and vllm-chat backends,
# we have request_outputs, which we use to count tokens.
total_prompt_tokens = 0
total_output_tokens = 0
for ro in request_outputs:
if not isinstance(ro, RequestOutput):
continue
total_prompt_tokens += len(
ro.prompt_token_ids) if ro.prompt_token_ids else 0
total_output_tokens += sum(
len(o.token_ids) for o in ro.outputs if o)
total_num_tokens = total_prompt_tokens + total_output_tokens
else:
total_num_tokens = sum(r.prompt_len + r.expected_output_len
for r in requests)
total_output_tokens = sum(r.expected_output_len for r in requests)
total_prompt_tokens = total_num_tokens - total_output_tokens
if is_multi_modal and args.backend != "vllm-chat":
print("\033[91mWARNING\033[0m: Multi-modal request with "
f"{args.backend} backend detected. The "
"following metrics are not accurate because image tokens are not"
" counted. See vllm-project/vllm/issues/9778 for details.")
# TODO(vllm-project/vllm/issues/9778): Count molti-modal token length.
# TODO(vllm-project/vllm/issues/9778): Count multi-modal token length.
# vllm-chat backend counts the image tokens now
print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
f"{total_num_tokens / elapsed_time:.2f} total tokens/s, "
f"{total_output_tokens / elapsed_time:.2f} output tokens/s")
print(f"Total num prompt tokens: {total_prompt_tokens}")
print(f"Total num output tokens: {total_output_tokens}")
# Output JSON results if specified
if args.output_json:
@ -435,20 +421,115 @@ def main(args: argparse.Namespace):
}
with open(args.output_json, "w") as f:
json.dump(results, f, indent=4)
save_to_pytorch_benchmark_format(args, results)
def validate_args(args):
"""
Validate command-line arguments.
"""
# === Deprecation and Defaulting ===
if args.dataset is not None:
warnings.warn(
"The '--dataset' argument will be deprecated in the next release. "
"Please use '--dataset-name' and '--dataset-path' instead.",
stacklevel=2)
args.dataset_path = args.dataset
if not getattr(args, "tokenizer", None):
args.tokenizer = args.model
# === Backend Validation ===
valid_backends = {"vllm", "hf", "mii", "vllm-chat"}
if args.backend not in valid_backends:
raise ValueError(f"Unsupported backend: {args.backend}")
# === Dataset Configuration ===
if not args.dataset and not args.dataset_path:
print(
"When dataset path is not set, it will default to random dataset")
args.dataset_name = 'random'
if args.input_len is None:
raise ValueError("input_len must be provided for a random dataset")
# === Dataset Name Specific Checks ===
# --hf-subset and --hf-split: only used
# when dataset_name is 'hf'
if args.dataset_name != "hf" and (
getattr(args, "hf_subset", None) is not None
or getattr(args, "hf_split", None) is not None):
warnings.warn("--hf-subset and --hf-split will be ignored \
since --dataset-name is not 'hf'.",
stacklevel=2)
elif args.dataset_name == "hf" and args.backend != "vllm-chat":
raise ValueError(
"When --dataset-name is 'hf', backend must be 'vllm-chat'")
# --random-range-ratio: only used when dataset_name is 'random'
if args.dataset_name != 'random' and args.random_range_ratio is not None:
warnings.warn("--random-range-ratio will be ignored since \
--dataset-name is not 'random'.",
stacklevel=2)
# --prefix-len: only used when dataset_name is 'random', 'sonnet', or not
# set.
if args.dataset_name not in {"random", "sonnet", None
} and args.prefix_len is not None:
warnings.warn("--prefix-len will be ignored since --dataset-name\
is not 'random', 'sonnet', or not set.",
stacklevel=2)
# === LoRA Settings ===
if getattr(args, "enable_lora", False) and args.backend != "vllm":
raise ValueError(
"LoRA benchmarking is only supported for vLLM backend")
if getattr(args, "enable_lora", False) and args.lora_path is None:
raise ValueError("LoRA path must be provided when enable_lora is True")
# === Backend-specific Validations ===
if args.backend == "hf" and args.hf_max_batch_size is None:
raise ValueError("HF max batch size is required for HF backend")
if args.backend != "hf" and args.hf_max_batch_size is not None:
raise ValueError("HF max batch size is only for HF backend.")
if args.backend in {"hf", "mii"} and getattr(args, "quantization",
None) is not None:
raise ValueError("Quantization is only for vLLM backend.")
if args.backend == "mii" and args.dtype != "auto":
raise ValueError("dtype must be auto for MII backend.")
if args.backend == "mii" and args.n != 1:
raise ValueError("n must be 1 for MII backend.")
if args.backend == "mii" and args.tokenizer != args.model:
raise ValueError(
"Tokenizer must be the same as the model for MII backend.")
if __name__ == "__main__":
parser = FlexibleArgumentParser(description="Benchmark the throughput.")
parser.add_argument("--backend",
type=str,
choices=["vllm", "hf", "mii"],
choices=["vllm", "hf", "mii", "vllm-chat"],
default="vllm")
parser.add_argument("--dataset",
parser.add_argument(
"--dataset-name",
type=str,
choices=["sharegpt", "random", "sonnet", "burstgpt", "hf"],
help="Name of the dataset to benchmark on.",
default="sharegpt")
parser.add_argument(
"--dataset",
type=str,
default=None,
help="Path to the ShareGPT dataset, will be deprecated in\
the next release. The dataset is expected to "
"be a json in form of list[dict[..., conversations: "
"list[dict[..., value: <prompt_or_response>]]]]")
parser.add_argument("--dataset-path",
type=str,
default=None,
help="Path to the dataset. The dataset is expected to "
"be a json in form of List[Dict[..., conversations: "
"List[Dict[..., value: <prompt_or_response>]]]]")
help="Path to the dataset")
parser.add_argument("--input-len",
type=int,
default=None,
@ -483,6 +564,11 @@ if __name__ == "__main__":
action='store_true',
default=False,
help="Disable decoupled async engine frontend.")
parser.add_argument(
"--disable-detokenize",
action="store_true",
help=("Do not detokenize the response (i.e. do not include "
"detokenization time in the measurement)"))
# LoRA
parser.add_argument(
"--lora-path",
@ -490,43 +576,33 @@ if __name__ == "__main__":
default=None,
help="Path to the lora adapters to use. This can be an absolute path, "
"a relative path, or a Hugging Face model identifier.")
parser.add_argument("--prefix-len",
type=int,
default=None,
help="Number of prefix tokens per request."
"This is for the RandomDataset and SonnetDataset")
# random dataset
parser.add_argument(
"--random-range-ratio",
type=float,
default=None,
help="Range of sampled ratio of input/output length, "
"used only for RandomDataSet.",
)
# hf dataset
parser.add_argument("--hf-subset",
type=str,
default=None,
help="Subset of the HF dataset.")
parser.add_argument("--hf-split",
type=str,
default=None,
help="Split of the HF dataset.")
parser = AsyncEngineArgs.add_cli_args(parser)
args = parser.parse_args()
if args.tokenizer is None:
args.tokenizer = args.model
if args.dataset is None:
assert args.input_len is not None
assert args.output_len is not None
else:
assert args.input_len is None
if args.enable_lora:
assert args.lora_path is not None
if args.backend == "vllm":
if args.hf_max_batch_size is not None:
raise ValueError("HF max batch size is only for HF backend.")
elif args.backend == "hf":
if args.hf_max_batch_size is None:
raise ValueError("HF max batch size is required for HF backend.")
if args.quantization is not None:
raise ValueError("Quantization is only for vLLM backend.")
if args.enable_lora is not None:
raise ValueError("LoRA benchmarking is only supported for vLLM"
" backend")
elif args.backend == "mii":
if args.dtype != "auto":
raise ValueError("dtype must be auto for MII backend.")
if args.n != 1:
raise ValueError("n must be 1 for MII backend.")
if args.quantization is not None:
raise ValueError("Quantization is only for vLLM backend.")
if args.hf_max_batch_size is not None:
raise ValueError("HF max batch size is only for HF backend.")
if args.tokenizer != args.model:
raise ValueError("Tokenizer must be the same as the model for MII "
"backend.")
if args.enable_lora is not None:
raise ValueError("LoRA benchmarking is only supported for vLLM"
" backend")
validate_args(args)
main(args)
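For reference, a minimal sketch of how the `validate_args` checks above behave, assuming an `argparse.Namespace` with the attribute names used in this file (values below are hypothetical):

```
import argparse

# Hypothetical namespace; attribute names mirror the parser above.
args = argparse.Namespace(
    backend="vllm", dataset=None, dataset_path=None, dataset_name="sharegpt",
    input_len=128, output_len=128, model="facebook/opt-125m", tokenizer=None,
    hf_subset=None, hf_split=None, random_range_ratio=None, prefix_len=None,
    enable_lora=False, lora_path=None, hf_max_batch_size=None,
    quantization=None, dtype="auto", n=1)

validate_args(args)   # passes; falls back to the random dataset and uses input_len

args.backend = "hf"   # the HF backend additionally requires --hf-max-batch-size
validate_args(args)   # raises ValueError: HF max batch size is required for HF backend
```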

View File

@ -0,0 +1,69 @@
# SPDX-License-Identifier: Apache-2.0
import argparse
import json
import math
import os
from typing import Any
def convert_to_pytorch_benchmark_format(args: argparse.Namespace,
metrics: dict[str, list],
extra_info: dict[str, Any]) -> list:
"""
Save the benchmark results in the format used by PyTorch OSS benchmark with
one metric per record
https://github.com/pytorch/pytorch/wiki/How-to-integrate-with-PyTorch-OSS-benchmark-database
"""
records = []
if not os.environ.get("SAVE_TO_PYTORCH_BENCHMARK_FORMAT", False):
return records
for name, benchmark_values in metrics.items():
record = {
"benchmark": {
"name": "vLLM benchmark",
"extra_info": {
"args": vars(args),
},
},
"model": {
"name": args.model,
},
"metric": {
"name": name,
"benchmark_values": benchmark_values,
"extra_info": extra_info,
},
}
tp = record["benchmark"]["extra_info"]["args"].get(
"tensor_parallel_size")
# Save tensor_parallel_size parameter if it's part of the metadata
if not tp and "tensor_parallel_size" in extra_info:
record["benchmark"]["extra_info"]["args"][
"tensor_parallel_size"] = extra_info["tensor_parallel_size"]
records.append(record)
return records
class InfEncoder(json.JSONEncoder):
def clear_inf(self, o: Any):
if isinstance(o, dict):
return {k: self.clear_inf(v) for k, v in o.items()}
elif isinstance(o, list):
return [self.clear_inf(v) for v in o]
elif isinstance(o, float) and math.isinf(o):
return "inf"
return o
def iterencode(self, o: Any, *args, **kwargs) -> Any:
return super().iterencode(self.clear_inf(o), *args, **kwargs)
def write_to_json(filename: str, records: list) -> None:
with open(filename, "w") as f:
json.dump(records, f, cls=InfEncoder)
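A small usage sketch of these helpers (metric names below are hypothetical; records are only produced when the `SAVE_TO_PYTORCH_BENCHMARK_FORMAT` environment variable is set):

```
import argparse

# Hypothetical args; in the benchmark scripts this comes from their own parser.
args = argparse.Namespace(model="facebook/opt-125m", tensor_parallel_size=1)

records = convert_to_pytorch_benchmark_format(
    args,
    metrics={"requests_per_second": [10.5, 11.2]},      # one record per metric
    extra_info={"num_prompts": 100, "p99_latency": float("inf")})

# InfEncoder rewrites float("inf") as the string "inf" so json.dump succeeds.
write_to_json("benchmark_results.json", records)
```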

View File

@ -5,7 +5,8 @@ import copy
import itertools
import pickle as pkl
import time
from typing import Callable, Iterable, List, Tuple
from collections.abc import Iterable
from typing import Callable
import torch
import torch.utils.benchmark as TBenchmark
@ -228,7 +229,7 @@ def print_timers(timers: Iterable[TMeasurement]):
def run(dtype: torch.dtype,
MKNs: Iterable[Tuple[int, int, int]]) -> Iterable[TMeasurement]:
MKNs: Iterable[tuple[int, int, int]]) -> Iterable[TMeasurement]:
results = []
for m, k, n in MKNs:
timers = bench(dtype, m, k, n, f"scaled-{dtype}-gemm",
@ -241,7 +242,7 @@ def run(dtype: torch.dtype,
# output makers
def make_output(data: Iterable[TMeasurement],
MKNs: Iterable[Tuple[int, int, int]],
MKNs: Iterable[tuple[int, int, int]],
base_description: str,
timestamp=None):
print(f"== All Results {base_description} ====")
@ -282,7 +283,7 @@ def run_model_bench(args):
for i, model in enumerate(args.models):
print(f"[{i}] {model}")
def model_shapes(model_name: str, tp_size: int) -> List[Tuple[int, int]]:
def model_shapes(model_name: str, tp_size: int) -> list[tuple[int, int]]:
KNs = []
for KN, tp_split_dim in copy.deepcopy(WEIGHT_SHAPES[model_name]):
KN[tp_split_dim] = KN[tp_split_dim] // tp_size

View File

@ -1,7 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# Cutlass bench utils
from typing import Iterable, Tuple
from collections.abc import Iterable
import torch
@ -27,7 +27,7 @@ def to_fp16(tensor: torch.Tensor) -> torch.Tensor:
def make_rand_tensors(dtype: torch.dtype, m: int, n: int,
k: int) -> Tuple[torch.Tensor, torch.Tensor]:
k: int) -> tuple[torch.Tensor, torch.Tensor]:
a = torch.randn((m, k), device='cuda') * 5
b = torch.randn((n, k), device='cuda').t() * 5
@ -63,7 +63,7 @@ def prune_to_2_4(tensor):
def make_rand_sparse_tensors(dtype: torch.dtype, m: int, n: int,
k: int) -> Tuple[torch.Tensor, torch.Tensor]:
k: int) -> tuple[torch.Tensor, torch.Tensor]:
a = torch.randn((m, k), device='cuda') * 5
b = torch.randn((n, k), device='cuda').t() * 5
@ -88,7 +88,7 @@ def make_rand_sparse_tensors(dtype: torch.dtype, m: int, n: int,
def make_n_rand_sparse_tensors(num_tensors: int, dtype: torch.dtype,
m: int, n: int, k: int) -> \
Tuple[Iterable[torch.Tensor], Iterable[torch.Tensor]]:
tuple[Iterable[torch.Tensor], Iterable[torch.Tensor]]:
ABs = []
for _ in range(num_tensors):
b_comp, e, a, b = make_rand_sparse_tensors(dtype, m, n, k)

View File

@ -5,7 +5,8 @@ import copy
import itertools
import pickle as pkl
import time
from typing import Callable, Iterable, List, Optional, Tuple
from collections.abc import Iterable
from typing import Callable, Optional
import torch
import torch.utils.benchmark as TBenchmark
@ -49,7 +50,7 @@ def bench_int8(
n: int,
label: str,
sub_label: str,
bench_kernels: Optional[List[str]] = None) -> Iterable[TMeasurement]:
bench_kernels: Optional[list[str]] = None) -> Iterable[TMeasurement]:
"""Benchmark INT8-based kernels."""
assert dtype == torch.int8
a, b = make_rand_tensors(torch.int8, m, n, k)
@ -101,7 +102,7 @@ def bench_fp8(
n: int,
label: str,
sub_label: str,
bench_kernels: Optional[List[str]] = None) -> Iterable[TMeasurement]:
bench_kernels: Optional[list[str]] = None) -> Iterable[TMeasurement]:
"""Benchmark FP8-based kernels."""
assert dtype == torch.float8_e4m3fn
a, b = make_rand_tensors(torch.float8_e4m3fn, m, n, k)
@ -180,7 +181,7 @@ def bench(dtype: torch.dtype,
n: int,
label: str,
sub_label: str,
bench_kernels: Optional[List[str]] = None) -> Iterable[TMeasurement]:
bench_kernels: Optional[list[str]] = None) -> Iterable[TMeasurement]:
if dtype == torch.int8:
return bench_int8(dtype, m, k, n, label, sub_label, bench_kernels)
if dtype == torch.float8_e4m3fn:
@ -195,8 +196,8 @@ def print_timers(timers: Iterable[TMeasurement]):
def run(dtype: torch.dtype,
MKNs: Iterable[Tuple[int, int, int]],
bench_kernels: Optional[List[str]] = None) -> Iterable[TMeasurement]:
MKNs: Iterable[tuple[int, int, int]],
bench_kernels: Optional[list[str]] = None) -> Iterable[TMeasurement]:
results = []
for m, k, n in MKNs:
timers = bench(dtype,
@ -212,7 +213,7 @@ def run(dtype: torch.dtype,
def make_output(data: Iterable[TMeasurement],
MKNs: Iterable[Tuple[int, int, int]],
MKNs: Iterable[tuple[int, int, int]],
base_description: str,
timestamp=None):
print(f"== All Results {base_description} ====")
@ -248,7 +249,7 @@ def run_model_bench(args):
for i, model in enumerate(args.models):
print(f"[{i}] {model}")
def model_shapes(model_name: str, tp_size: int) -> List[Tuple[int, int]]:
def model_shapes(model_name: str, tp_size: int) -> list[tuple[int, int]]:
KNs = []
for KN, tp_split_dim in copy.deepcopy(WEIGHT_SHAPES[model_name]):
KN[tp_split_dim] = KN[tp_split_dim] // tp_size

View File

@ -2,9 +2,10 @@
import pickle as pkl
import time
from collections.abc import Iterable
from dataclasses import dataclass
from itertools import product
from typing import Callable, Iterable, List, Optional
from typing import Callable, Optional
import torch
import torch.utils.benchmark as TBenchmark
@ -29,7 +30,7 @@ class bench_params_t:
f'x DT {self.dtype}')
def get_bench_params() -> List[bench_params_t]:
def get_bench_params() -> list[bench_params_t]:
## Test Fixtures
NUM_TOKENS = [2**x for x in range(11)]
HIDDEN_SIZES = list(range(1024, 8129, 1024))

View File

@ -40,7 +40,7 @@ def main(num_tokens: int,
end_time = time.perf_counter()
if profile:
torch.cuda.cudart().cudaProfilerStart()
torch.cuda.cudart().cudaProfilerStop()
return (end_time - start_time) / num_iters
# Warmup.

View File

@ -9,7 +9,7 @@ from dataclasses import dataclass
from enum import Enum, auto
from itertools import product
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional, Tuple
from typing import Any, Callable, Optional
import torch
import torch.utils.benchmark as TBenchmark
@ -23,6 +23,7 @@ from vllm.lora.ops.triton_ops.bgmv_shrink import bgmv_shrink
from vllm.lora.ops.triton_ops.sgmv_expand import sgmv_expand
from vllm.lora.ops.triton_ops.sgmv_shrink import sgmv_shrink
from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT
from vllm.lora.ops.triton_ops.v1 import V1KernelMeta, v1_expand, v1_shrink
from vllm.utils import FlexibleArgumentParser
DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())
@ -61,15 +62,15 @@ def make_rand_lora_weight_tensor(k: int,
def make_rand_tensors(
a_shape: Tuple[int],
b_shape: Tuple[int],
c_shape: Tuple[int],
a_shape: tuple[int],
b_shape: tuple[int],
c_shape: tuple[int],
a_dtype: torch.dtype,
b_dtype: torch.dtype,
c_dtype: torch.dtype,
num_slices: int,
device: str = "cuda",
) -> Tuple[torch.Tensor, List[torch.Tensor], torch.Tensor]:
) -> tuple[torch.Tensor, list[torch.Tensor], torch.Tensor]:
"""
Make LoRA input/output matrices.
"""
@ -89,7 +90,7 @@ def make_prompt_lora_mapping(num_prompts: int, num_active_loras: int,
sort_by_lora_id: bool,
device: str) -> torch.Tensor:
"""
All prompts are mapped to a Lora ID in range [0, num_active_loras).
All prompts are mapped to a LoRA ID in range [0, num_active_loras),
where 0 refers to the first LoRA, 1 refers to the second LoRA, and so on.
"""
assert num_active_loras > 0
@ -135,7 +136,7 @@ def make_token_lora_mapping(num_tokens: int, num_prompts: int,
def ref_group_gemm(ref_out: torch.Tensor, input: torch.Tensor,
lora_weights: List[torch.Tensor],
lora_weights: list[torch.Tensor],
seq_lens_cpu: torch.Tensor,
prompt_lora_mapping_cpu: torch.Tensor, scaling: float,
add_inputs: Optional[bool]):
@ -153,7 +154,6 @@ def ref_group_gemm(ref_out: torch.Tensor, input: torch.Tensor,
result = torch.nn.functional.linear(x, w)
result *= scaling
out_list.append(result)
torch.cat(out_list, dim=0)
cat_result = torch.cat(out_list, dim=0)
@ -172,6 +172,8 @@ class OpType(Enum):
SGMV_EXPAND = auto()
BGMV_EXPAND = auto()
BGMV_EXPAND_SLICE = auto()
V1_SHRINK = auto()
V1_EXPAND = auto()
@staticmethod
def from_str(s: str) -> "OpType":
@ -185,28 +187,43 @@ class OpType(Enum):
return OpType.BGMV_EXPAND
if s.lower() == "bgmv_expand_slice":
return OpType.BGMV_EXPAND_SLICE
if s.lower() == "v1_shrink":
return OpType.V1_SHRINK
if s.lower() == "v1_expand":
return OpType.V1_EXPAND
raise ValueError(f"Unrecognized str {s} to convert to OpType")
def is_shrink_fn(self) -> bool:
return self in [OpType.SGMV_SHRINK, OpType.BGMV_SHRINK]
return self in [
OpType.SGMV_SHRINK, OpType.BGMV_SHRINK, OpType.V1_SHRINK
]
def is_expand_fn(self) -> bool:
return self in [OpType.SGMV_EXPAND, OpType.BGMV_EXPAND]
return self in [
OpType.SGMV_EXPAND, OpType.BGMV_EXPAND, OpType.V1_EXPAND
]
def is_prefill_op(self) -> bool:
return self in [OpType.SGMV_SHRINK, OpType.SGMV_EXPAND]
return self in [
OpType.SGMV_SHRINK, OpType.SGMV_EXPAND, OpType.V1_SHRINK,
OpType.V1_EXPAND
]
def is_decode_op(self) -> bool:
return self in [
OpType.BGMV_SHRINK, OpType.BGMV_EXPAND, OpType.BGMV_EXPAND_SLICE
OpType.BGMV_SHRINK, OpType.BGMV_EXPAND, OpType.BGMV_EXPAND_SLICE,
OpType.V1_SHRINK, OpType.V1_EXPAND
]
def is_expand_slice_fn(self) -> bool:
return self in [OpType.BGMV_EXPAND_SLICE]
def num_slices(self) -> List[int]:
if self in [OpType.SGMV_EXPAND, OpType.SGMV_SHRINK]:
# SGMV kernels supports slices
def num_slices(self) -> list[int]:
if self in [
OpType.SGMV_EXPAND, OpType.SGMV_SHRINK, OpType.V1_SHRINK,
OpType.V1_EXPAND
]:
# SGMV kernels and V1 kernels support slices
return [1, 2, 3]
if self in [OpType.BGMV_SHRINK, OpType.BGMV_EXPAND]:
return [1]
@ -215,7 +232,7 @@ class OpType(Enum):
raise ValueError(f"Unrecognized OpType {self}")
def mkn(self, batch_size: int, seq_length: int, hidden_size: int,
lora_rank: int) -> Tuple[int, int, int]:
lora_rank: int) -> tuple[int, int, int]:
num_tokens = batch_size * seq_length
if self.is_shrink_fn():
m = num_tokens
@ -230,7 +247,7 @@ class OpType(Enum):
def matmul_dtypes(
self, op_dtype: torch.dtype
) -> Tuple[torch.dtype, torch.dtype, torch.dtype]:
) -> tuple[torch.dtype, torch.dtype, torch.dtype]:
"""
return a type, b type and c type for A x B = C
"""
@ -243,7 +260,7 @@ class OpType(Enum):
def matmul_shapes(
self, batch_size: int, seq_length: int, hidden_size: int,
lora_rank: int, num_loras: int,
num_slices: int) -> Tuple[Tuple[int], Tuple[int], Tuple[int]]:
num_slices: int) -> tuple[tuple[int], tuple[int], tuple[int]]:
"""
Given num_slices, return the shapes of the A, B, and C matrices
in A x B = C, for the op_type
@ -251,11 +268,13 @@ class OpType(Enum):
m, k, n = self.mkn(batch_size, seq_length, hidden_size, lora_rank)
b_shape = (num_loras, n, k) # col-major
if self == OpType.SGMV_SHRINK:
# SGMV shrink supports num_slices inherently in the kernel
if self in [OpType.SGMV_SHRINK, OpType.V1_SHRINK]:
# SGMV shrink and V1 shrink kernels support num_slices inherently
# in the kernel.
return ((m, k), b_shape, (num_slices, m, n))
if self == OpType.SGMV_EXPAND:
# SGMV expand supports num_slices inherently in the kernel
if self in [OpType.SGMV_EXPAND, OpType.V1_EXPAND]:
# SGMV expand and V1 expand kernels support num_slices inherently
# in the kernel
return ((num_slices, m, k), b_shape, (m, n * num_slices))
if self == OpType.BGMV_SHRINK:
return ((m, k), b_shape, (m, n))
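As a worked example of the shape conventions above, a sketch with hypothetical sizes; the expected tuples follow from `mkn()` and the branches shown here:

```
# hidden_size=4096, lora_rank=16, batch_size=8, seq_length=1 -> num_tokens=8
shapes = OpType.V1_SHRINK.matmul_shapes(
    batch_size=8, seq_length=1, hidden_size=4096,
    lora_rank=16, num_loras=4, num_slices=2)
# expected: A (8, 4096) x B (4, 16, 4096) -> C (2, 8, 16), i.e.
# shapes == ((8, 4096), (4, 16, 4096), (2, 8, 16))

shapes = OpType.V1_EXPAND.matmul_shapes(
    batch_size=8, seq_length=1, hidden_size=4096,
    lora_rank=16, num_loras=4, num_slices=2)
# expected: A (2, 8, 16) x B (4, 4096, 16) -> C (8, 8192)
```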
@ -268,7 +287,7 @@ class OpType(Enum):
def bench_fn(self) -> Callable:
def emulate_bgmv_expand_slice(kwargs_list: List[Dict[str, Any]]):
def emulate_bgmv_expand_slice(kwargs_list: list[dict[str, Any]]):
for x in kwargs_list:
bgmv_expand_slice(**x)
@ -282,25 +301,30 @@ class OpType(Enum):
return bgmv_expand
if self == OpType.BGMV_EXPAND_SLICE:
return emulate_bgmv_expand_slice
if self == OpType.V1_SHRINK:
return v1_shrink
if self == OpType.V1_EXPAND:
return v1_expand
raise ValueError(f"Unrecognized optype {self}")
def run_ref_group_gemm(self, output: torch.Tensor, input: torch.Tensor,
lora_weights: List[torch.Tensor],
lora_weights: list[torch.Tensor],
**kwargs) -> Callable:
"""Each benchmark operation expected the input, lora_weights and outputs
"""Each benchmark operation expects the input, lora_weights and outputs
in a slightly different format. Refer to self.matmul_shapes().
run_ref_group_gemm accounts for those differences in executing a
reference group gemm for correctness testing.
"""
w_dtype = lora_weights[0].dtype
num_slices = len(lora_weights)
if self == OpType.SGMV_SHRINK:
if self in [OpType.SGMV_SHRINK, OpType.V1_SHRINK]:
for slice_idx in range(num_slices):
ref_group_gemm(ref_out=output[slice_idx, :],
input=input,
lora_weights=lora_weights[slice_idx],
**kwargs)
if self == OpType.SGMV_EXPAND:
elif self in [OpType.SGMV_EXPAND, OpType.V1_EXPAND]:
hidden_size = lora_weights[0].shape[1]
for slice_idx in range(num_slices):
slice_offset = slice_idx * hidden_size
@ -309,19 +333,19 @@ class OpType(Enum):
input=input[slice_idx].clone().to(dtype=w_dtype),
lora_weights=lora_weights[slice_idx],
**kwargs)
if self == OpType.BGMV_SHRINK:
elif self == OpType.BGMV_SHRINK:
assert num_slices == 1
ref_group_gemm(ref_out=output,
input=input,
lora_weights=lora_weights[0],
**kwargs)
if self == OpType.BGMV_EXPAND:
elif self == OpType.BGMV_EXPAND:
assert num_slices == 1
ref_group_gemm(ref_out=output,
input=input.clone().to(dtype=w_dtype),
lora_weights=lora_weights[0],
**kwargs)
if self == OpType.BGMV_EXPAND_SLICE:
elif self == OpType.BGMV_EXPAND_SLICE:
hidden_size = lora_weights[0].shape[1]
for slice_idx in range(num_slices):
slice_offset = slice_idx * hidden_size
@ -330,7 +354,8 @@ class OpType(Enum):
input=input[slice_idx].clone().to(dtype=w_dtype),
lora_weights=lora_weights[slice_idx],
**kwargs)
raise ValueError(f"Unrecognized optype {self}")
else:
raise ValueError(f"Unrecognized optype {self}")
@dataclass
@ -384,13 +409,15 @@ class BenchmarkTensors:
"""
# matmul tensors
input: torch.Tensor
lora_weights_lst: List[torch.Tensor]
lora_weights_lst: list[torch.Tensor]
output: torch.Tensor
# metadata tensors
seq_lens: torch.Tensor
seq_start_loc: torch.Tensor
prompt_lora_mapping: torch.Tensor
token_lora_mapping: torch.Tensor
# v1 kernel metadata
v1_kernel_meta: Optional[V1KernelMeta] = None
def io_types(self) -> str:
return (f"{dtype_to_str(self.input.dtype)}x"
@ -433,10 +460,19 @@ class BenchmarkTensors:
total_tokens, ctx.batch_size, prompt_lora_indices_tensor,
seq_len_tensor, "cpu")
v1_kernel_meta = None
if op_type in [OpType.V1_SHRINK, OpType.V1_EXPAND]:
v1_kernel_meta = V1KernelMeta.make(
max_loras=ctx.num_loras,
max_num_tokens=token_lora_indices_tensor.size(0),
device="cpu")
v1_kernel_meta.prepare_tensors(
token_lora_mapping=token_lora_indices_tensor)
return BenchmarkTensors(input_tensor, lora_weights, output_tensor,
seq_len_tensor, seq_start_loc_tensor,
prompt_lora_indices_tensor,
token_lora_indices_tensor)
token_lora_indices_tensor, v1_kernel_meta)
def sanity_check(self) -> None:
"""
@ -469,7 +505,14 @@ class BenchmarkTensors:
for i in range(len(self.lora_weights_lst)):
self.lora_weights_lst[i] = to_device(self.lora_weights_lst[i])
def metadata(self) -> Tuple[int, int, int]:
# v1 meta
if self.v1_kernel_meta:
for field_name in V1KernelMeta.__dataclass_fields__:
field = getattr(self.v1_kernel_meta, field_name)
assert isinstance(field, torch.Tensor)
setattr(self.v1_kernel_meta, field_name, to_device(field))
def metadata(self) -> tuple[int, int, int]:
"""
Return num_seqs, num_tokens and max_seq_len
"""
@ -505,7 +548,7 @@ class BenchmarkTensors:
self.seq_lens = seq_lens.to(dtype=self.seq_lens.dtype)
self.seq_start_loc = seq_start_loc.to(dtype=self.seq_start_loc.dtype)
def as_sgmv_shrink_kwargs(self) -> Dict[str, Any]:
def as_sgmv_shrink_kwargs(self) -> dict[str, Any]:
self.convert_to_sgmv_benchmark_tensors()
self.sanity_check()
self.to_device(self.input.device)
@ -540,7 +583,7 @@ class BenchmarkTensors:
'scaling': 1.0,
}
def as_sgmv_expand_kwargs(self, add_inputs: bool) -> Dict[str, Any]:
def as_sgmv_expand_kwargs(self, add_inputs: bool) -> dict[str, Any]:
self.convert_to_sgmv_benchmark_tensors()
self.sanity_check()
@ -578,7 +621,7 @@ class BenchmarkTensors:
'add_inputs': add_inputs,
}
def as_bgmv_shrink_kwargs(self) -> Dict[str, Any]:
def as_bgmv_shrink_kwargs(self) -> dict[str, Any]:
assert len(self.lora_weights_lst) == 1
self.to_device(self.input.device)
@ -634,7 +677,7 @@ class BenchmarkTensors:
'add_inputs': add_inputs
}
def as_bgmv_expand_slice_kwargs(self, add_inputs: bool) -> Dict[str, Any]:
def as_bgmv_expand_slice_kwargs(self, add_inputs: bool) -> dict[str, Any]:
_, num_tokens, _, num_slices = self.metadata()
# Sanity check shapes
@ -668,9 +711,81 @@ class BenchmarkTensors:
})
return {'kwargs_list': kwargs_list}
def as_v1_shrink_kwargs(self) -> dict[str, Any]:
assert self.v1_kernel_meta is not None
self.sanity_check()
self.to_device(self.input.device)
_, num_tokens, _, num_slices = self.metadata()
# Sanity check matrix shapes.
i_shape, lw_shape, o_shape = self.input.shape, self.lora_weights_lst[
0].shape, self.output.shape
# Expected input shape [num_tokens, hidden_size]
assert len(i_shape) == 2
assert i_shape[0] == num_tokens
hidden_size = i_shape[1]
# Expected lora weight shape [num_loras, lora_rank, hidden_size]
assert len(lw_shape) == 3
assert lw_shape[2] == hidden_size
lora_rank = lw_shape[1]
# Expected output shape [num_slices, num_tokens, lora_rank]
assert len(o_shape) == 3
assert o_shape == (num_slices, num_tokens, lora_rank)
return {
'inputs': self.input,
'lora_a_weights': self.lora_weights_lst,
'output_tensor': self.output,
'token_lora_mapping': self.v1_kernel_meta.token_lora_mapping,
'token_indices_sorted_by_lora_ids':
self.v1_kernel_meta.token_indices_sorted_by_lora_ids,
'num_tokens_per_lora': self.v1_kernel_meta.num_tokens_per_lora,
'lora_token_start_loc': self.v1_kernel_meta.lora_token_start_loc,
'lora_ids': self.v1_kernel_meta.active_lora_ids,
'scaling': 1.0,
}
def as_v1_expand_kwargs(self, add_inputs: bool) -> dict[str, Any]:
assert self.v1_kernel_meta is not None
self.sanity_check()
self.to_device(self.input.device)
_, num_tokens, _, num_slices = self.metadata()
# Sanity check matrix shapes.
i_shape, lw_shape, o_shape = self.input.shape, self.lora_weights_lst[
0].shape, self.output.shape
# Expected input shape : [num_slices, num_tokens, lora_rank]
assert len(i_shape) == 3
assert i_shape[0] == num_slices
assert i_shape[1] == num_tokens
lora_rank = i_shape[2]
# Expected lora weight shape : [num_lora, hidden_size, lora_rank]
assert len(lw_shape) == 3
assert lw_shape[2] == lora_rank
hidden_size = lw_shape[1]
# Expected output shape : [num_tokens, hidden_size * num_slices]
assert len(o_shape) == 2
assert o_shape == (num_tokens, hidden_size * num_slices)
return {
'inputs': self.input,
'lora_b_weights': self.lora_weights_lst,
'output_tensor': self.output,
'token_lora_mapping': self.v1_kernel_meta.token_lora_mapping,
'token_indices_sorted_by_lora_ids':
self.v1_kernel_meta.token_indices_sorted_by_lora_ids,
'num_tokens_per_lora': self.v1_kernel_meta.num_tokens_per_lora,
'lora_token_start_loc': self.v1_kernel_meta.lora_token_start_loc,
'lora_ids': self.v1_kernel_meta.active_lora_ids,
'offset_start': 0,
'add_inputs': add_inputs,
}
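To make the expected V1 layouts above concrete, a self-contained shape sketch (sizes are hypothetical, tensors left uninitialized):

```
import torch

num_tokens, hidden_size, lora_rank, num_loras, num_slices = 8, 4096, 16, 4, 2

# v1_shrink: inputs (num_tokens, hidden_size), each lora_a weight
# (num_loras, lora_rank, hidden_size), output (num_slices, num_tokens, lora_rank)
shrink_in = torch.empty(num_tokens, hidden_size)
shrink_ws = [torch.empty(num_loras, lora_rank, hidden_size) for _ in range(num_slices)]
shrink_out = torch.empty(num_slices, num_tokens, lora_rank)

# v1_expand: inputs (num_slices, num_tokens, lora_rank), each lora_b weight
# (num_loras, hidden_size, lora_rank), output (num_tokens, hidden_size * num_slices)
expand_in = torch.empty(num_slices, num_tokens, lora_rank)
expand_ws = [torch.empty(num_loras, hidden_size, lora_rank) for _ in range(num_slices)]
expand_out = torch.empty(num_tokens, hidden_size * num_slices)
```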
def bench_fn_kwargs(self,
op_type: OpType,
add_inputs: Optional[bool] = None) -> Dict[str, Any]:
add_inputs: Optional[bool] = None) -> dict[str, Any]:
if op_type.is_shrink_fn():
assert add_inputs is None
else:
@ -686,6 +801,10 @@ class BenchmarkTensors:
return self.as_bgmv_expand_kwargs(add_inputs)
if op_type == OpType.BGMV_EXPAND_SLICE:
return self.as_bgmv_expand_slice_kwargs(add_inputs)
if op_type == OpType.V1_SHRINK:
return self.as_v1_shrink_kwargs()
if op_type == OpType.V1_EXPAND:
return self.as_v1_expand_kwargs(add_inputs)
raise ValueError(f"Unrecognized optype {self}")
def test_correctness(self, op_type: OpType,
@ -734,7 +853,7 @@ def bench_optype(ctx: BenchmarkContext,
assert expand_fn_add_inputs is not None
# BenchmarkContext -> BenchmarkTensors
bench_tensors : List[BenchmarkTensors] = \
bench_tensors : list[BenchmarkTensors] = \
[BenchmarkTensors.make(ctx, op_type) for _ in range(arg_pool_size)]
for bt in bench_tensors:
bt.sanity_check()
@ -746,7 +865,7 @@ def bench_optype(ctx: BenchmarkContext,
for bt in bench_tensors
])
# BenchmarkTensors -> Dict (kwargs)
# BenchmarkTensors -> dict (kwargs)
kwargs_list = [
bt.bench_fn_kwargs(op_type, add_inputs=expand_fn_add_inputs)
for bt in bench_tensors
@ -841,7 +960,7 @@ def use_cuda_graph_recommendation() -> str:
"""
def print_timers(timers: List[TMeasurement],
def print_timers(timers: list[TMeasurement],
args: Optional[argparse.Namespace] = None):
compare = TBenchmark.Compare(timers)
compare.print()
@ -861,7 +980,7 @@ def print_timers(timers: List[TMeasurement],
"small num_loras the goal should be to match the torch.mm numbers.")
def run(args: argparse.Namespace, bench_ctxs: List[BenchmarkContext]):
def run(args: argparse.Namespace, bench_ctxs: list[BenchmarkContext]):
if args.cuda_graph_nops is not None:
assert args.cuda_graph_nops > 0
@ -873,12 +992,9 @@ def run(args: argparse.Namespace, bench_ctxs: List[BenchmarkContext]):
timers = []
for bench_ctx in bench_ctxs:
for seq_len in args.seq_lengths:
bench_ops: List[OpType] = []
if seq_len == 1:
# bench all decode ops
bench_ops = [op for op in args.op_types if op.is_decode_op()]
else:
# bench all prefill ops
bench_ops: list[OpType] = args.op_types
if seq_len > 1:
# bench only prefill ops
bench_ops = [op for op in args.op_types if op.is_prefill_op()]
seq_len_timers = []
@ -921,10 +1037,10 @@ def run(args: argparse.Namespace, bench_ctxs: List[BenchmarkContext]):
pickle.dump(timers, f)
def as_benchmark_contexts(hidden_sizes: List[int], lora_ranks: List[int],
args: argparse.Namespace) -> List[BenchmarkContext]:
def as_benchmark_contexts(hidden_sizes: list[int], lora_ranks: list[int],
args: argparse.Namespace) -> list[BenchmarkContext]:
ctxs: List[BenchmarkContext] = []
ctxs: list[BenchmarkContext] = []
for batch_size, hidden_size, lora_rank, num_loras, sort_by_lora_id in product( # noqa
args.batch_sizes, list(hidden_sizes), lora_ranks, args.num_loras,
args.sort_by_lora_id):
@ -954,7 +1070,7 @@ def run_list_bench(args: argparse.Namespace):
f" LoRA Ranks {args.lora_ranks}")
# Get all benchmarking contexts
bench_contexts: List[BenchmarkContext] = as_benchmark_contexts(
bench_contexts: list[BenchmarkContext] = as_benchmark_contexts(
hidden_sizes=args.hidden_sizes, lora_ranks=args.lora_ranks, args=args)
run(args, bench_contexts)
@ -975,7 +1091,7 @@ def run_range_bench(args: argparse.Namespace):
f" LoRA Ranks {lora_ranks}")
# Get all benchmarking contexts
bench_contexts: List[BenchmarkContext] = as_benchmark_contexts(
bench_contexts: list[BenchmarkContext] = as_benchmark_contexts(
hidden_sizes=hidden_sizes, lora_ranks=lora_ranks, args=args)
run(args, bench_contexts)
@ -1002,7 +1118,7 @@ def run_model_bench(args: argparse.Namespace):
f" LoRA Ranks {args.lora_ranks}")
# Get all benchmarking contexts
bench_contexts: List[BenchmarkContext] = as_benchmark_contexts(
bench_contexts: list[BenchmarkContext] = as_benchmark_contexts(
hidden_sizes=hidden_sizes, lora_ranks=args.lora_ranks, args=args)
run(args, bench_contexts)

View File

@ -7,9 +7,10 @@ import math
import os
import pickle as pkl
import time
from collections.abc import Iterable
from dataclasses import dataclass
from itertools import product
from typing import Callable, Iterable, List, Optional, Tuple
from typing import Callable, Optional
import pandas as pd
import torch
@ -44,7 +45,6 @@ def terse_type_name(dt):
torch.float16: "fp16",
torch.int8: "int8",
torch.float8_e4m3fn: "fp8",
torch.bfloat16: "bf16",
torch.float: "float",
torch.int: "int",
}[dt]
@ -102,8 +102,8 @@ def quantize_and_pack(atype: torch.dtype,
return w_ref, w_q, w_s, w_zp
def create_bench_tensors(shape: Tuple[int, int, int], types: TypeConfig,
group_size: Optional[int]) -> List[BenchmarkTensors]:
def create_bench_tensors(shape: tuple[int, int, int], types: TypeConfig,
group_size: Optional[int]) -> list[BenchmarkTensors]:
m, n, k = shape
# we want to make sure that weights don't fit into L2 cache between runs so
@ -114,7 +114,7 @@ def create_bench_tensors(shape: Tuple[int, int, int], types: TypeConfig,
a = rand_data((m, k), types.act_type, scale=5)
benchmark_tensors: List[BenchmarkTensors] = []
benchmark_tensors: list[BenchmarkTensors] = []
for _ in range(num_weights):
w = rand_data((k, n), types.act_type, scale=5)
@ -258,7 +258,7 @@ def machete_create_bench_fn(bt: BenchmarkTensors,
return lambda: ops.machete_mm(
a=bt.a,
b_q=bt.w_q,
b_q=w_q,
b_type=bt.wtype,
b_group_scales=bt.w_g_s,
b_group_zeros=w_g_zp,
@ -276,7 +276,7 @@ def machete_create_bench_fn(bt: BenchmarkTensors,
def bench_fns(label: str, sub_label: str, description: str,
fns: List[Callable]):
fns: list[Callable]):
min_run_time = 1 if not NVTX_PROFILE else 0.1
res = TBenchmark.Timer(
@ -311,7 +311,7 @@ def bench(types: TypeConfig,
n: int,
label: str,
sub_label: str,
sweep_schedules: bool = True) -> List[TMeasurement]:
sweep_schedules: bool = True) -> list[TMeasurement]:
benchmark_tensors = create_bench_tensors((m, n, k), types, group_size)
sub_label += f", L={len(benchmark_tensors)}"
@ -414,12 +414,12 @@ def bench(types: TypeConfig,
# runner
def print_timers(timers: List[TMeasurement]):
def print_timers(timers: list[TMeasurement]):
compare = TBenchmark.Compare(timers)
compare.print()
def run(args, MKNs: Iterable[Tuple[int, int, int]]) -> Iterable[TMeasurement]:
def run(args, MKNs: Iterable[tuple[int, int, int]]) -> Iterable[TMeasurement]:
types = TypeConfig(
act_type=args.act_type,
weight_type=scalar_types.uint4b8 if args.group_zero_type is None \
@ -431,7 +431,7 @@ def run(args, MKNs: Iterable[Tuple[int, int, int]]) -> Iterable[TMeasurement]:
token_scale_type=args.token_scale_type,
)
results: List[TMeasurement] = []
results: list[TMeasurement] = []
for m, k, n in MKNs:
timers = bench(types,
args.group_size,
@ -449,8 +449,8 @@ def run(args, MKNs: Iterable[Tuple[int, int, int]]) -> Iterable[TMeasurement]:
# output makers
def make_output(
data: List[TMeasurement],
MKNs: Iterable[Tuple[int, int, int]],
data: list[TMeasurement],
MKNs: Iterable[tuple[int, int, int]],
base_description: str,
timestamp=None,
):
@ -497,7 +497,7 @@ def run_model_bench(args):
for i, model in enumerate(args.models):
print(f"[{i}] {model}")
def model_shapes(model_name: str, tp_size: int) -> List[Tuple[int, int]]:
def model_shapes(model_name: str, tp_size: int) -> list[tuple[int, int]]:
KNs = []
for KN, tp_split_dim in copy.deepcopy(WEIGHT_SHAPES[model_name]):
KN[tp_split_dim] = KN[tp_split_dim] // tp_size

View File

@ -1,7 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
from typing import List
import torch
import torch.utils.benchmark as benchmark
from benchmark_shapes import WEIGHT_SHAPES
@ -10,6 +8,8 @@ from vllm import _custom_ops as ops
from vllm.model_executor.layers.quantization.gptq_marlin_24 import (
GPTQ_MARLIN_24_MAX_PARALLEL, GPTQ_MARLIN_24_MIN_THREAD_N,
GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES, GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES)
from vllm.model_executor.layers.quantization.utils.allspark_utils import (
ALLSPARK_AMPERE_M_CUBLAS_THRESHOLD, ALLSPARK_SUPPORTED_QUANT_TYPES)
from vllm.model_executor.layers.quantization.utils.marlin_utils import (
GPTQ_MARLIN_MAX_PARALLEL, GPTQ_MARLIN_MIN_THREAD_N,
MARLIN_SUPPORTED_GROUP_SIZES, query_marlin_supported_quant_types)
@ -18,18 +18,18 @@ from vllm.model_executor.layers.quantization.utils.marlin_utils_test import (
from vllm.model_executor.layers.quantization.utils.marlin_utils_test_24 import (
marlin_24_quantize)
from vllm.model_executor.layers.quantization.utils.quant_utils import (
gptq_pack, gptq_quantize_weights, sort_weights)
gptq_pack, gptq_quantize_weights, quantize_weights, sort_weights)
from vllm.scalar_type import ScalarType
from vllm.utils import FlexibleArgumentParser
DEFAULT_MODELS = ["meta-llama/Llama-2-7b-hf/TP1"]
DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512]
DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192]
ACT_ORDER_OPTS = [False, True]
K_FULL_OPTS = [False, True]
def bench_run(results: List[benchmark.Measurement], model: str,
def bench_run(results: list[benchmark.Measurement], model: str,
act_order: bool, is_k_full: bool, quant_type: ScalarType,
group_size: int, size_m: int, size_k: int, size_n: int):
label = "Quant Matmul"
@ -81,6 +81,27 @@ def bench_run(results: List[benchmark.Measurement], model: str,
GPTQ_MARLIN_24_MAX_PARALLEL)
marlin_zp = torch.zeros_like(marlin_s, dtype=torch.int)
# AllSpark W8A16 quant
as_supported_case = (quant_type in ALLSPARK_SUPPORTED_QUANT_TYPES
and group_size == -1 and not act_order and is_k_full)
if as_supported_case:
properties = torch.cuda.get_device_properties(b.device.index)
sm_count = properties.multi_processor_count
sm_version = properties.major * 10 + properties.minor
supported_arch = (sm_version >= 80 and sm_version < 90)
as_supported_case = as_supported_case and supported_arch
if supported_arch:
has_zp = False
w_ref, qw, s, zp = quantize_weights(b, quant_type, group_size,
has_zp)
qw = qw.to(torch.uint8)
qw_reorder, s_reorder, zp_reorder = \
ops.allspark_repack_weight(
qw, s, zp, has_zp)
CUBLAS_M_THRESHOLD = ALLSPARK_AMPERE_M_CUBLAS_THRESHOLD
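For instance, the SM-version check above evaluates as follows for a few common data-center GPUs (devices listed only for illustration):

```
# Hypothetical devices: A100 (8.0), Ada/L40S (8.9), H100 (9.0).
for major, minor in [(8, 0), (8, 9), (9, 0)]:
    sm_version = major * 10 + minor
    supported_arch = (sm_version >= 80 and sm_version < 90)
    print(sm_version, supported_arch)   # 80 True, 89 True, 90 False
```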
globals = {
# Gen params
"quant_type": quant_type,
@ -109,10 +130,19 @@ def bench_run(results: List[benchmark.Measurement], model: str,
# GPTQ params
"q_w_gptq": q_w_gptq,
"repack_sort_indices": repack_sort_indices,
# AllSpark W8A16 params
"qw_reorder": qw_reorder if as_supported_case else None,
"s_reorder": s_reorder if as_supported_case else None,
"zp_reorder": zp_reorder if as_supported_case else None,
"sm_count": sm_count if as_supported_case else None,
"sm_version": sm_version if as_supported_case else None,
"CUBLAS_M_THRESHOLD":
CUBLAS_M_THRESHOLD if as_supported_case else None,
# Kernels
"gptq_marlin_gemm": ops.gptq_marlin_gemm,
"gptq_marlin_24_gemm": ops.gptq_marlin_24_gemm,
"gptq_marlin_repack": ops.gptq_marlin_repack,
"allspark_w8a16_gemm": ops.allspark_w8a16_gemm,
}
min_run_time = 1
@ -172,13 +202,24 @@ def bench_run(results: List[benchmark.Measurement], model: str,
description="gptq_marlin_repack",
).blocked_autorange(min_run_time=min_run_time))
if as_supported_case:
results.append(
benchmark.Timer(
stmt=
"output = allspark_w8a16_gemm(a, qw_reorder, s_reorder, zp_reorder, size_n, group_size, sm_count, sm_version, CUBLAS_M_THRESHOLD, False, True)", # noqa: E501
globals=globals,
label=label,
sub_label=sub_label,
description="allspark_w8a16_gemm_fp32",
).blocked_autorange(min_run_time=min_run_time))
def main(args):
print("Benchmarking models:")
for i, model in enumerate(args.models):
print(f"[{i}] {model}")
results: List[benchmark.Measurement] = []
results: list[benchmark.Measurement] = []
for model in args.models:
for layer in WEIGHT_SHAPES[model]:

View File

@ -1,10 +1,12 @@
# SPDX-License-Identifier: Apache-2.0
import argparse
import json
import time
from contextlib import nullcontext
from datetime import datetime
from itertools import product
from typing import Any, Dict, List, Tuple, TypedDict
from typing import Any, TypedDict
import ray
import torch
@ -16,8 +18,7 @@ from vllm.model_executor.layers.fused_moe.fused_moe import *
from vllm.platforms import current_platform
from vllm.utils import FlexibleArgumentParser
FP8_DTYPE = torch.float8_e4m3fnuz if current_platform.is_rocm(
) else torch.float8_e4m3fn
FP8_DTYPE = current_platform.fp8_dtype()
class BenchmarkConfig(TypedDict):
@ -40,6 +41,7 @@ def benchmark_config(
use_fp8_w8a8: bool,
use_int8_w8a16: bool,
num_iters: int = 100,
block_quant_shape: List[int] = None,
) -> float:
init_dtype = torch.float16 if use_fp8_w8a8 else dtype
x = torch.randn(num_tokens, hidden_size, dtype=dtype)
@ -81,8 +83,24 @@ def benchmark_config(
dtype=torch.float32)
w2_scale = torch.randn((hidden_size, num_experts), dtype=torch.float32)
if use_fp8_w8a8:
w1_scale = torch.randn(num_experts, dtype=torch.float32)
w2_scale = torch.randn(num_experts, dtype=torch.float32)
if block_quant_shape:
block_n, block_k = block_quant_shape[0], block_quant_shape[1]
E = num_experts
N = shard_intermediate_size // 2
K = hidden_size
factor_for_scale = 1e-2
n_tiles_w1 = (2 * N + block_n - 1) // block_n
n_tiles_w2 = (K + block_n - 1) // block_n
k_tiles_w1 = (K + block_k - 1) // block_k
k_tiles_w2 = (N + block_k - 1) // block_k
w1_scale = torch.rand((E, n_tiles_w1, k_tiles_w1),
dtype=torch.float32) * factor_for_scale
w2_scale = torch.rand((E, n_tiles_w2, k_tiles_w2),
dtype=torch.float32) * factor_for_scale
else:
w1_scale = torch.randn(num_experts, dtype=torch.float32)
w2_scale = torch.randn(num_experts, dtype=torch.float32)
a1_scale = torch.randn(1, dtype=torch.float32)
a2_scale = torch.randn(1, dtype=torch.float32)
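As a worked example of the block-scale tile arithmetic above (hypothetical sizes: 8 experts, `shard_intermediate_size=4096`, `hidden_size=7168`, block shape `[128, 128]`):

```
E, shard_intermediate_size, hidden_size = 8, 4096, 7168
block_n, block_k = 128, 128
N, K = shard_intermediate_size // 2, hidden_size

n_tiles_w1 = (2 * N + block_n - 1) // block_n   # ceil(4096 / 128) = 32
n_tiles_w2 = (K + block_n - 1) // block_n       # ceil(7168 / 128) = 56
k_tiles_w1 = (K + block_k - 1) // block_k       # ceil(7168 / 128) = 56
k_tiles_w2 = (N + block_k - 1) // block_k       # ceil(2048 / 128) = 16
# w1_scale: (8, 32, 56), w2_scale: (8, 56, 16)
```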
@ -111,6 +129,7 @@ def benchmark_config(
w2_scale=w2_scale,
a1_scale=a1_scale,
a2_scale=a2_scale,
block_shape=block_quant_shape,
)
# JIT compilation & warmup
@ -132,7 +151,7 @@ def benchmark_config(
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)
latencies: List[float] = []
latencies: list[float] = []
for i in range(num_iters):
prepare(i)
torch.cuda.synchronize()
@ -175,8 +194,9 @@ def get_rocm_tuning_space(use_fp16):
return param_ranges
def get_configs_compute_bound(use_fp16) -> List[Dict[str, int]]:
configs: List[BenchmarkConfig] = []
def get_configs_compute_bound(use_fp16,
block_quant_shape) -> list[dict[str, int]]:
configs: list[BenchmarkConfig] = []
if current_platform.is_rocm():
param_ranges = get_rocm_tuning_space(use_fp16)
@ -204,17 +224,27 @@ def get_configs_compute_bound(use_fp16) -> List[Dict[str, int]]:
for config_values in product(*values):
config = dict(zip(keys, config_values))
configs.append(config)
# Remove configs that are not compatible with fp8 block quantization
# BLOCK_SIZE_K must be a multiple of block_k
# BLOCK_SIZE_N must be a multiple of block_n
if block_quant_shape is not None and not use_fp16:
block_n, block_k = block_quant_shape[0], block_quant_shape[1]
for config in configs[:]:
if config["BLOCK_SIZE_K"] % block_k != 0 or config[
"BLOCK_SIZE_N"] % block_n != 0:
configs.remove(config)
return configs
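The same compatibility filter can be written as a comprehension; a standalone sketch with dummy configs, using the same `[block_n, block_k]` convention as above:

```
block_n, block_k = 128, 128
configs = [{"BLOCK_SIZE_N": 256, "BLOCK_SIZE_K": 128},
           {"BLOCK_SIZE_N": 64, "BLOCK_SIZE_K": 96}]
configs = [
    c for c in configs
    if c["BLOCK_SIZE_K"] % block_k == 0 and c["BLOCK_SIZE_N"] % block_n == 0
]
# Only the first config survives: 128 % 128 == 0 and 256 % 128 == 0.
```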
def prune_rocm_search_space(num_tokens, shard_intermediate_size, hidden_size,
search_space, is_fp16):
search_space, is_fp16, topk):
N1, K1 = shard_intermediate_size, hidden_size
N2, K2 = hidden_size, shard_intermediate_size // 2
pruned_space_1 = prune_rocm_configs(num_tokens * 2, N1, K1, search_space,
is_fp16)
pruned_space_2 = prune_rocm_configs(num_tokens * 2, N2, K2, search_space,
is_fp16)
pruned_space_1 = prune_rocm_configs(num_tokens * topk, N1, K1,
search_space, is_fp16)
pruned_space_2 = prune_rocm_configs(num_tokens * topk, N2, K2,
search_space, is_fp16)
search_space = merge_unique_dicts(pruned_space_1, pruned_space_2)
return search_space
@ -335,7 +365,8 @@ class BenchmarkWorker:
dtype: torch.dtype,
use_fp8_w8a8: bool,
use_int8_w8a16: bool,
) -> Tuple[Dict[str, int], float]:
block_quant_shape: List[int] = None,
) -> tuple[dict[str, int], float]:
current_platform.seed_everything(self.seed)
dtype_str = get_config_dtype_str(dtype,
use_int8_w8a16=use_int8_w8a16,
@ -355,10 +386,17 @@ class BenchmarkWorker:
else:
config = op_config[min(op_config.keys(),
key=lambda x: abs(x - num_tokens))]
kernel_time = benchmark_config(config, num_tokens, num_experts,
shard_intermediate_size, hidden_size,
topk, dtype, use_fp8_w8a8,
use_int8_w8a16)
kernel_time = benchmark_config(config,
num_tokens,
num_experts,
shard_intermediate_size,
hidden_size,
topk,
dtype,
use_fp8_w8a8,
use_int8_w8a16,
num_iters=100,
block_quant_shape=block_quant_shape)
return config, kernel_time
def tune(
@ -371,8 +409,9 @@ class BenchmarkWorker:
dtype: torch.dtype,
use_fp8_w8a8: bool,
use_int8_w8a16: bool,
search_space: List[Dict[str, int]],
) -> Dict[str, int]:
search_space: list[dict[str, int]],
block_quant_shape: list[int],
) -> dict[str, int]:
best_config = None
best_time = float("inf")
if current_platform.is_rocm():
@ -380,21 +419,24 @@ class BenchmarkWorker:
search_space = prune_rocm_search_space(num_tokens,
shard_intermediate_size,
hidden_size, search_space,
is_fp16)
is_fp16, topk)
with torch.cuda.device(self.device_id):
with torch.cuda.device(self.device_id) if current_platform.is_rocm(
) else nullcontext():
for config in tqdm(search_space):
try:
kernel_time = benchmark_config(config,
num_tokens,
num_experts,
shard_intermediate_size,
hidden_size,
topk,
dtype,
use_fp8_w8a8,
use_int8_w8a16,
num_iters=20)
kernel_time = benchmark_config(
config,
num_tokens,
num_experts,
shard_intermediate_size,
hidden_size,
topk,
dtype,
use_fp8_w8a8,
use_int8_w8a16,
num_iters=20,
block_quant_shape=block_quant_shape)
except triton.runtime.autotuner.OutOfResources:
# Some configurations may be invalid and fail to compile.
continue
@ -434,10 +476,10 @@ def sort_config(config: BenchmarkConfig) -> BenchmarkConfig:
}
def save_configs(configs: Dict[int, BenchmarkConfig], num_experts: int,
def save_configs(configs: dict[int, BenchmarkConfig], num_experts: int,
shard_intermediate_size: int, hidden_size: int, topk: int,
dtype: torch.dtype, use_fp8_w8a8: bool,
use_int8_w8a16: bool) -> None:
dtype: torch.dtype, use_fp8_w8a8: bool, use_int8_w8a16: bool,
block_quant_shape: List[int]) -> None:
dtype_str = get_config_dtype_str(dtype,
use_int8_w8a16=use_int8_w8a16,
use_fp8_w8a8=use_fp8_w8a8)
@ -445,7 +487,7 @@ def save_configs(configs: Dict[int, BenchmarkConfig], num_experts: int,
# NOTE(woosuk): The current naming convention uses w2.shape[2], which
# is the intermediate size after silu_and_mul.
filename = get_config_file_name(num_experts, shard_intermediate_size // 2,
dtype_str)
dtype_str, block_quant_shape)
print(f"Writing best config to {filename}...")
with open(filename, "w") as f:
@ -453,9 +495,17 @@ def save_configs(configs: Dict[int, BenchmarkConfig], num_experts: int,
f.write("\n")
def get_weight_block_size_safety(config, default_value=None):
quantization_config = getattr(config, 'quantization_config', {})
if isinstance(quantization_config, dict):
return quantization_config.get('weight_block_size', default_value)
return default_value
def main(args: argparse.Namespace):
print(args)
block_quant_shape = None
config = AutoConfig.from_pretrained(
args.model, trust_remote_code=args.trust_remote_code)
if config.architectures[0] == "DbrxForCausalLM":
@ -468,11 +518,18 @@ def main(args: argparse.Namespace):
topk = config.num_experts_per_tok
intermediate_size = config.intermediate_size
shard_intermediate_size = 2 * intermediate_size // args.tp_size
elif config.architectures[0] == "DeepseekV3ForCausalLM":
elif (config.architectures[0] == "DeepseekV3ForCausalLM"
or config.architectures[0] == "DeepseekV2ForCausalLM"):
E = config.n_routed_experts
topk = config.num_experts_per_tok
intermediate_size = config.moe_intermediate_size
shard_intermediate_size = 2 * intermediate_size // args.tp_size
block_quant_shape = get_weight_block_size_safety(config)
elif config.architectures[0] == "Qwen2MoeForCausalLM":
E = config.num_experts
topk = config.num_experts_per_tok
intermediate_size = config.moe_intermediate_size
shard_intermediate_size = 2 * intermediate_size // args.tp_size
else:
# Default: Mixtral.
E = config.num_local_experts
@ -497,7 +554,7 @@ def main(args: argparse.Namespace):
num_gpus = int(ray.available_resources()["GPU"])
workers = [BenchmarkWorker.remote(args.seed) for _ in range(num_gpus)]
def _distribute(method: str, inputs: List[Any]) -> List[Any]:
def _distribute(method: str, inputs: list[Any]) -> list[Any]:
outputs = []
worker_idx = 0
for input_args in inputs:
@ -510,27 +567,30 @@ def main(args: argparse.Namespace):
if args.tune:
is_fp16 = not (use_fp8_w8a8 or use_int8_w8a16)
search_space = get_configs_compute_bound(is_fp16)
search_space = get_configs_compute_bound(is_fp16, block_quant_shape)
print(f"Start tuning over {len(search_space)} configurations...")
start = time.time()
configs = _distribute(
"tune", [(batch_size, E, shard_intermediate_size, hidden_size,
topk, dtype, use_fp8_w8a8, use_int8_w8a16, search_space)
for batch_size in batch_sizes])
"tune",
[(batch_size, E, shard_intermediate_size, hidden_size, topk, dtype,
use_fp8_w8a8, use_int8_w8a16, search_space, block_quant_shape)
for batch_size in batch_sizes])
best_configs = {
M: sort_config(config)
for M, config in zip(batch_sizes, configs)
}
save_configs(best_configs, E, shard_intermediate_size, hidden_size,
topk, dtype, use_fp8_w8a8, use_int8_w8a16)
topk, dtype, use_fp8_w8a8, use_int8_w8a16,
block_quant_shape)
end = time.time()
print(f"Tuning took {end - start:.2f} seconds")
else:
outputs = _distribute(
"benchmark", [(batch_size, E, shard_intermediate_size, hidden_size,
topk, dtype, use_fp8_w8a8, use_int8_w8a16)
for batch_size in batch_sizes])
"benchmark",
[(batch_size, E, shard_intermediate_size, hidden_size, topk, dtype,
use_fp8_w8a8, use_int8_w8a16, block_quant_shape)
for batch_size in batch_sizes])
for batch_size, (config, kernel_time) in zip(batch_sizes, outputs):
print(f"Batch size: {batch_size}, config: {config}")

View File

@ -2,7 +2,7 @@
import random
import time
from typing import List, Optional
from typing import Optional
import torch
@ -11,8 +11,9 @@ from vllm.platforms import current_platform
from vllm.utils import (STR_DTYPE_TO_TORCH_DTYPE, FlexibleArgumentParser,
create_kv_caches_with_random)
NUM_BLOCKS = 1024
NUM_BLOCKS = 128 * 1024
PARTITION_SIZE = 512
PARTITION_SIZE_ROCM = 256
@torch.inference_mode()
@ -54,7 +55,7 @@ def main(
# Create the block tables.
max_num_blocks_per_seq = (max_seq_len + block_size - 1) // block_size
block_tables_lst: List[List[int]] = []
block_tables_lst: list[list[int]] = []
for _ in range(num_seqs):
block_table = [
random.randint(0, NUM_BLOCKS - 1)
@ -80,6 +81,12 @@ def main(
# Prepare for the paged attention kernel.
output = torch.empty_like(query)
if version == "v2":
if current_platform.is_rocm():
global PARTITION_SIZE
if not args.custom_paged_attn:
PARTITION_SIZE = 1024
else:
PARTITION_SIZE = PARTITION_SIZE_ROCM
num_partitions = ((max_seq_len + PARTITION_SIZE - 1) // PARTITION_SIZE)
tmp_output = torch.empty(
size=(num_seqs, num_query_heads, num_partitions, head_size),
@ -123,32 +130,53 @@ def main(
v_scale,
)
elif version == "v2":
ops.paged_attention_v2(
output,
exp_sums,
max_logits,
tmp_output,
query,
key_cache,
value_cache,
num_kv_heads,
scale,
block_tables,
seq_lens,
block_size,
max_seq_len,
alibi_slopes,
kv_cache_dtype,
k_scale,
v_scale,
)
if not args.custom_paged_attn:
ops.paged_attention_v2(
output,
exp_sums,
max_logits,
tmp_output,
query,
key_cache,
value_cache,
num_kv_heads,
scale,
block_tables,
seq_lens,
block_size,
max_seq_len,
alibi_slopes,
kv_cache_dtype,
k_scale,
v_scale,
)
else:
ops.paged_attention_rocm(
output,
exp_sums,
max_logits,
tmp_output,
query,
key_cache,
value_cache,
num_kv_heads,
scale,
block_tables,
seq_lens,
block_size,
max_seq_len,
alibi_slopes,
kv_cache_dtype,
k_scale,
v_scale,
)
else:
raise ValueError(f"Invalid version: {version}")
torch.cuda.synchronize()
end_time = time.perf_counter()
if profile:
torch.cuda.cudart().cudaProfilerStart()
torch.cuda.cudart().cudaProfilerStop()
return (end_time - start_time) / num_iters
# Warmup.
@ -195,6 +223,9 @@ if __name__ == '__main__':
help="Data type for kv cache storage. If 'auto', will use model "
"data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. "
"ROCm (AMD GPU) supports fp8 (=fp8_e4m3)")
parser.add_argument("--custom-paged-attn",
action="store_true",
help="Use custom paged attention")
args = parser.parse_args()
print(args)

View File

@ -40,7 +40,7 @@ def main(num_tokens: int,
end_time = time.perf_counter()
if profile:
torch.cuda.cudart().cudaProfilerStart()
torch.cuda.cudart().cudaProfilerStop()
return (end_time - start_time) / num_iters
# Warmup.

View File

@ -1,7 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
import itertools
from typing import Optional, Tuple, Union
from typing import Optional, Union
import torch
import triton
@ -22,7 +22,7 @@ class HuggingFaceRMSNorm(nn.Module):
self,
x: torch.Tensor,
residual: Optional[torch.Tensor] = None,
) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
orig_dtype = x.dtype
x = x.to(torch.float32)
if residual is not None:
@ -139,7 +139,7 @@ def calculate_diff(batch_size, seq_len, hidden_size, use_residual=True):
print(f"Naive output={output_naive}")
print(f"FlashInfer output={output_flashinfer}")
print(f"VLLM output={output_vllm}")
print(f"vLLM output={output_vllm}")
if torch.allclose(output_naive, output_flashinfer, atol=1e-2,
rtol=1e-2) and torch.allclose(

View File

@ -1,7 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
from itertools import accumulate
from typing import List, Optional
from typing import Optional
import nvtx
import torch
@ -39,7 +39,7 @@ def benchmark_rope_kernels_multi_lora(
})
# non-batched RoPE takes only one scaling factor, we create multiple
# instances to simulate the same behavior
non_batched_ropes: List[RotaryEmbedding] = []
non_batched_ropes: list[RotaryEmbedding] = []
for scaling_factor in scaling_factors:
non_batched_ropes.append(
get_rope(head_size, rotary_dim, max_position, base, is_neox_style,

View File

@ -0,0 +1,129 @@
# DeepSeek DeepGEMM Kernels Benchmark
This directory contains benchmarks comparing DeepSeek's DeepGEMM block FP8 kernels against vLLM's existing Triton- and CUTLASS-based kernels.
Currently this covers only dense GEMMs and only runs on Hopper GPUs.
## Setup
You need to install vLLM in your usual fashion, then install DeepGEMM from source in its own directory:
```
git clone --recursive https://github.com/deepseek-ai/DeepGEMM
cd DeepGEMM
python setup.py install
uv pip install -e .
```
## Usage
```
python benchmark_fp8_block_dense_gemm.py
INFO 02-26 21:55:13 [__init__.py:207] Automatically detected platform cuda.
===== STARTING FP8 GEMM BENCHMARK =====
PyTorch version: 2.5.1+cu124
CUDA version: 12.4
Triton version: 3.1.0
Using device: NVIDIA H100 80GB HBM3
WARNING 02-26 21:55:15 [fp8_utils.py:458] Using default W8A8 Block FP8 kernel config. Performance might be sub-optimal! Config file not found at /home/mgoin/code/vllm/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
INFO 02-26 21:55:15 [fp8_utils.py:449] Using configuration from /home/mgoin/code/vllm/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json for W8A8 Block FP8 kernel.
WARNING 02-26 21:55:16 [fp8_utils.py:458] Using default W8A8 Block FP8 kernel config. Performance might be sub-optimal! Config file not found at /home/mgoin/code/vllm/vllm/model_executor/layers/quantization/utils/configs/N=18432,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
WARNING 02-26 21:55:17 [fp8_utils.py:458] Using default W8A8 Block FP8 kernel config. Performance might be sub-optimal! Config file not found at /home/mgoin/code/vllm/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json
INFO 02-26 21:55:17 [fp8_utils.py:449] Using configuration from /home/mgoin/code/vllm/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json for W8A8 Block FP8 kernel.
INFO 02-26 21:55:17 [fp8_utils.py:449] Using configuration from /home/mgoin/code/vllm/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128,128].json for W8A8 Block FP8 kernel.
===== PERFORMANCE COMPARISON =====
DeepGEMM Implementation:
+------+-------+-------+-----------+--------+--------+
| m | n | k | Time (μs) | TFLOPS | GB/s |
+------+-------+-------+-----------+--------+--------+
| 8 | 4096 | 7168 | 102.9 | 4.6 | 286.4 |
| 8 | 7168 | 18432 | 70.8 | 29.8 | 1868.8 |
| 8 | 18432 | 7168 | 69.3 | 30.5 | 1911.8 |
| 64 | 4096 | 7168 | 69.1 | 54.4 | 439.0 |
| 64 | 7168 | 18432 | 69.4 | 243.6 | 1933.6 |
| 64 | 18432 | 7168 | 70.4 | 240.3 | 1917.2 |
| 64 | 24576 | 1536 | 70.1 | 68.9 | 584.6 |
| 64 | 32768 | 512 | 68.4 | 31.4 | 307.1 |
| 64 | 7168 | 16384 | 69.5 | 216.3 | 1718.5 |
| 128 | 4096 | 7168 | 141.1 | 53.3 | 222.1 |
| 128 | 7168 | 18432 | 71.9 | 470.5 | 1896.1 |
| 128 | 18432 | 7168 | 69.3 | 488.2 | 1988.2 |
| 1024 | 4096 | 7168 | 89.7 | 670.1 | 502.5 |
| 1024 | 18432 | 7168 | 279.0 | 969.8 | 635.2 |
| 2048 | 4096 | 7168 | 175.1 | 687.0 | 347.4 |
| 4096 | 4096 | 7168 | 335.4 | 717.0 | 275.1 |
+------+-------+-------+-----------+--------+--------+
vLLM Triton Implementation:
+------+-------+-------+-----------+--------+--------+--------------+
| m | n | k | Time (μs) | TFLOPS | GB/s | vs DeepGEMM |
+------+-------+-------+-----------+--------+--------+--------------+
| 8 | 4096 | 7168 | 74.0 | 6.3 | 398.2 | 1.39x faster |
| 8 | 7168 | 18432 | 89.6 | 23.6 | 1478.1 | 0.79x slower |
| 8 | 18432 | 7168 | 113.2 | 18.7 | 1170.4 | 0.61x slower |
| 64 | 4096 | 7168 | 79.4 | 47.3 | 382.2 | 0.87x slower |
| 64 | 7168 | 18432 | 98.5 | 171.7 | 1363.0 | 0.70x slower |
| 64 | 18432 | 7168 | 119.5 | 141.5 | 1129.4 | 0.59x slower |
| 64 | 24576 | 1536 | 37.6 | 128.4 | 1089.7 | 1.86x faster |
| 64 | 32768 | 512 | 38.7 | 55.5 | 542.6 | 1.77x faster |
| 64 | 7168 | 16384 | 86.1 | 174.5 | 1386.4 | 0.81x slower |
| 128 | 4096 | 7168 | 90.7 | 82.9 | 345.4 | 1.56x faster |
| 128 | 7168 | 18432 | 144.0 | 234.9 | 946.9 | 0.50x slower |
| 128 | 18432 | 7168 | 229.5 | 147.4 | 600.1 | 0.30x slower |
| 1024 | 4096 | 7168 | 242.3 | 248.2 | 186.1 | 0.37x slower |
| 1024 | 18432 | 7168 | 897.8 | 301.4 | 197.4 | 0.31x slower |
| 2048 | 4096 | 7168 | 463.0 | 259.7 | 131.4 | 0.38x slower |
| 4096 | 4096 | 7168 | 901.8 | 266.7 | 102.3 | 0.37x slower |
+------+-------+-------+-----------+--------+--------+--------------+
vLLM CUTLASS Implementation:
+------+-------+-------+-----------+--------+--------+--------------+--------------+
| m | n | k | Time (μs) | TFLOPS | GB/s | vs DeepGEMM | vs Triton |
+------+-------+-------+-----------+--------+--------+--------------+--------------+
| 8 | 4096 | 7168 | 34.6 | 13.6 | 852.3 | 2.98x faster | 2.14x faster |
| 8 | 7168 | 18432 | 78.9 | 26.8 | 1677.3 | 0.90x slower | 1.13x faster |
| 8 | 18432 | 7168 | 81.2 | 26.0 | 1631.1 | 0.85x slower | 1.39x faster |
| 64 | 4096 | 7168 | 36.9 | 101.9 | 822.9 | 1.87x faster | 2.15x faster |
| 64 | 7168 | 18432 | 87.4 | 193.4 | 1535.2 | 0.79x slower | 1.13x faster |
| 64 | 18432 | 7168 | 85.0 | 199.0 | 1587.6 | 0.83x slower | 1.41x faster |
| 64 | 24576 | 1536 | 28.0 | 172.8 | 1465.8 | 2.51x faster | 1.35x faster |
| 64 | 32768 | 512 | 28.8 | 74.5 | 728.5 | 2.37x faster | 1.34x faster |
| 64 | 7168 | 16384 | 77.9 | 193.0 | 1532.8 | 0.89x slower | 1.11x faster |
| 128 | 4096 | 7168 | 39.1 | 192.4 | 802.0 | 3.61x faster | 2.32x faster |
| 128 | 7168 | 18432 | 93.7 | 360.8 | 1454.2 | 0.77x slower | 1.54x faster |
| 128 | 18432 | 7168 | 85.7 | 394.8 | 1608.0 | 0.81x slower | 2.68x faster |
| 1024 | 4096 | 7168 | 99.7 | 603.1 | 452.2 | 0.90x slower | 2.43x faster |
| 1024 | 18432 | 7168 | 331.3 | 816.7 | 534.9 | 0.84x slower | 2.71x faster |
| 2048 | 4096 | 7168 | 198.3 | 606.6 | 306.7 | 0.88x slower | 2.34x faster |
| 4096 | 4096 | 7168 | 392.2 | 613.2 | 235.3 | 0.86x slower | 2.30x faster |
+------+-------+-------+-----------+--------+--------+--------------+--------------+
===== AVERAGE PERFORMANCE =====
+----------------+------------+----------+---------------+
| Implementation | Avg TFLOPS | Avg GB/s | Avg Time (ms) |
+----------------+------------+----------+---------------+
| DeepGEMM | 310.98 | 1052.10 | 0.11 |
| vLLM Triton | 144.30 | 715.60 | 0.23 |
| vLLM CUTLASS | 286.78 | 1076.67 | 0.11 |
+----------------+------------+----------+---------------+
===== AVERAGE SPEEDUPS =====
+-----------------------------+--------------+
| Comparison | Speedup |
+-----------------------------+--------------+
| DeepGEMM vs vLLM Triton | 1.71x faster |
| DeepGEMM vs vLLM CUTLASS | 0.94x slower |
| vLLM CUTLASS vs vLLM Triton | 1.84x faster |
+-----------------------------+--------------+
===== ACCURACY COMPARISON =====
+----------------+-----------------------+
| Implementation | Avg Diff vs Reference |
+----------------+-----------------------+
| DeepGEMM | 0.000684 |
| vLLM Triton | 0.000684 |
| vLLM CUTLASS | 0.000684 |
+----------------+-----------------------+
```

View File

@ -0,0 +1,464 @@
# SPDX-License-Identifier: Apache-2.0
# fmt: off
# ruff: noqa: E501
import time
# Import DeepGEMM functions
import deep_gemm
import torch
import triton
from deep_gemm import calc_diff, ceil_div, get_col_major_tma_aligned_tensor
# Import vLLM functions
from vllm import _custom_ops as ops
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
per_token_group_quant_fp8, w8a8_block_fp8_matmul)
# Copied from
# https://github.com/deepseek-ai/DeepGEMM/blob/78cacf70d41d15d688bd493ebc85845f7f2a3d5d/tests/test_core.py#L9
def per_token_cast_to_fp8(
x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
"""Convert tensor to FP8 format with per-token scaling."""
assert x.dim() == 2 and x.size(1) % 128 == 0
m, n = x.shape
x_view = x.view(m, -1, 128)
x_amax = x_view.abs().float().amax(dim=2).view(m, -1).clamp(1e-4)
return (x_view * (448.0 / x_amax.unsqueeze(2))).to(
torch.float8_e4m3fn).view(m, n), (x_amax / 448.0).view(m, -1)
# Copied from
# https://github.com/deepseek-ai/DeepGEMM/blob/78cacf70d41d15d688bd493ebc85845f7f2a3d5d/tests/test_core.py#L17
def per_block_cast_to_fp8(
x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
"""Convert tensor to FP8 format with per-block scaling."""
assert x.dim() == 2
m, n = x.shape
x_padded = torch.zeros((ceil_div(m, 128) * 128, ceil_div(n, 128) * 128),
dtype=x.dtype,
device=x.device)
x_padded[:m, :n] = x
x_view = x_padded.view(-1, 128, x_padded.size(1) // 128, 128)
x_amax = x_view.abs().float().amax(dim=(1, 3), keepdim=True).clamp(1e-4)
x_scaled = (x_view * (448.0 / x_amax)).to(torch.float8_e4m3fn)
return x_scaled.view_as(x_padded)[:m, :n].contiguous(), (
x_amax / 448.0).view(x_view.size(0), x_view.size(2))
def benchmark_shape(m: int,
n: int,
k: int,
warmup: int = 100,
repeat: int = 10000,
verbose: bool = False) -> dict:
"""Benchmark all implementations for a specific (m, n, k) shape."""
if verbose:
print(f"\n=== Benchmarking shape: m={m}, n={n}, k={k} ===")
# Create test tensors
A = torch.randn((m, k), device='cuda', dtype=torch.bfloat16)
B = torch.randn((n, k), device='cuda', dtype=torch.bfloat16)
# Reference result in BF16
torch.cuda.synchronize()
C_ref = A @ B.t()
# Pre-quantize B for all implementations
# (weights can be pre-quantized offline)
B_deepgemm, B_scale_deepgemm = per_block_cast_to_fp8(B)
B_vllm, B_scale_vllm = per_block_cast_to_fp8(B)
# Block size configuration
block_size = [128, 128]
# Pre-quantize A for all implementations
A_deepgemm, A_scale_deepgemm = per_token_cast_to_fp8(A)
A_scale_deepgemm = get_col_major_tma_aligned_tensor(A_scale_deepgemm)
C_deepgemm = torch.empty((m, n), device='cuda', dtype=torch.bfloat16)
A_vllm, A_scale_vllm = per_token_group_quant_fp8(A, block_size[1])
A_vllm_cutlass, A_scale_vllm_cutlass = per_token_group_quant_fp8(
A, block_size[1], column_major_scales=True)
# === DeepGEMM Implementation ===
def deepgemm_gemm():
# A quantization is inside the loop as it depends on activations
# A_deepgemm, A_scale_deepgemm = per_token_cast_to_fp8(A)
# A_deepgemm, A_scale_deepgemm = per_token_group_quant_fp8(
# A, block_size[1])
# A_scale_aligned = get_col_major_tma_aligned_tensor(A_scale_deepgemm)
# C_deepgemm = torch.empty((m, n), device='cuda', dtype=torch.bfloat16)
deep_gemm.gemm_fp8_fp8_bf16_nt((A_deepgemm, A_scale_deepgemm),
(B_deepgemm, B_scale_deepgemm),
C_deepgemm)
return C_deepgemm
# === vLLM Triton Implementation ===
def vllm_triton_gemm():
# A quantization is inside the loop as it depends on activations
# A_vllm, A_scale_vllm = per_token_group_quant_fp8(A, block_size[1])
return w8a8_block_fp8_matmul(A_vllm,
B_vllm,
A_scale_vllm,
B_scale_vllm,
block_size,
output_dtype=torch.bfloat16)
# === vLLM CUTLASS Implementation ===
def vllm_cutlass_gemm():
# A quantization is inside the loop as it depends on activations
# A_vllm_cutlass, A_scale_vllm_cutlass = per_token_group_quant_fp8(
# A, block_size[1], column_major_scales=True)
return ops.cutlass_scaled_mm(A_vllm_cutlass,
B_vllm.T,
scale_a=A_scale_vllm_cutlass,
scale_b=B_scale_vllm.T,
out_dtype=torch.bfloat16)
# Run correctness check first
if verbose:
print("Running correctness check...")
C_deepgemm = deepgemm_gemm()
C_vllm_triton = vllm_triton_gemm()
C_vllm_cutlass = vllm_cutlass_gemm()
deepgemm_diff = calc_diff(C_deepgemm, C_ref)
vllm_triton_diff = calc_diff(C_vllm_triton, C_ref)
vllm_cutlass_diff = calc_diff(C_vllm_cutlass, C_ref)
if verbose:
print(f"DeepGEMM vs Reference difference: {deepgemm_diff:.6f}")
print(f"vLLM Triton vs Reference difference: {vllm_triton_diff:.6f}")
print(f"vLLM CUTLASS vs Reference difference: {vllm_cutlass_diff:.6f}")
print("vLLM Triton vs DeepGEMM difference: "
f"{calc_diff(C_vllm_triton, C_deepgemm):.6f}")
print("vLLM CUTLASS vs DeepGEMM difference: "
f"{calc_diff(C_vllm_cutlass, C_deepgemm):.6f}")
# Benchmark implementations
implementations = {
"DeepGEMM": deepgemm_gemm,
"vLLM Triton": vllm_triton_gemm,
"vLLM CUTLASS": vllm_cutlass_gemm
}
benchmark_results = {
"shape": {
"m": m,
"n": n,
"k": k
},
"implementations": {}
}
for name, func in implementations.items():
# Warmup
for _ in range(warmup):
func()
torch.cuda.synchronize()
# Timing loop
torch.cuda.synchronize()
start = time.time()
for _ in range(repeat):
func()
torch.cuda.synchronize()
end = time.time()
# Calculate timing and TFLOPS
avg_time_ms = (end - start) / repeat * 1000
avg_time_us = avg_time_ms * 1000
tflops = 2 * m * n * k / (avg_time_ms * 1e-3) / 1e12
gb_s = (m * k + k * n + m * n * 2) / 1e9 / (avg_time_ms * 1e-3)
benchmark_results["implementations"][name] = {
"time_ms": avg_time_ms,
"time_us": avg_time_us,
"tflops": tflops,
"gb_s": gb_s,
"diff": {
"DeepGEMM":
0.0 if name == "DeepGEMM" else calc_diff(func(), C_deepgemm),
"Reference":
deepgemm_diff if name == "DeepGEMM" else
(vllm_triton_diff
if name == "vLLM Triton" else vllm_cutlass_diff)
}
}
if verbose:
print(
f"{name}: {avg_time_ms:.3f} ms, {tflops:.2f} TFLOPS, {gb_s:.2f} GB/s"
)
# Calculate speedups
baseline = benchmark_results["implementations"]["DeepGEMM"]["time_ms"]
for name, data in benchmark_results["implementations"].items():
if name != "DeepGEMM":
speedup = baseline / data["time_ms"]
benchmark_results["implementations"][name][
"speedup_vs_deepgemm"] = speedup
if verbose:
print(f"DeepGEMM is {1/speedup:.2f}x "
f"{'faster' if 1/speedup > 1 else 'slower'} than {name}")
vllm_triton_time = benchmark_results["implementations"]["vLLM Triton"][
"time_ms"]
vllm_cutlass_time = benchmark_results["implementations"]["vLLM CUTLASS"][
"time_ms"]
cutlass_vs_triton = vllm_triton_time / vllm_cutlass_time
benchmark_results["implementations"]["vLLM CUTLASS"][
"speedup_vs_triton"] = cutlass_vs_triton
if verbose:
print(
f"vLLM CUTLASS is {cutlass_vs_triton:.2f}x "
f"{'faster' if cutlass_vs_triton > 1 else 'slower'} than vLLM Triton"
)
return benchmark_results
def format_table_row(values, widths):
"""Format a row with specified column widths."""
return "| " + " | ".join(f"{val:{w}}"
for val, w in zip(values, widths)) + " |"
def print_table(headers, rows, title=None):
"""Print a table with headers and rows."""
if title:
print(f"\n{title}")
# Calculate column widths based on headers and data
widths = [
max(len(str(h)), max(len(str(row[i])) for row in rows))
for i, h in enumerate(headers)
]
# Create separator line
separator = "+-" + "-+-".join("-" * w for w in widths) + "-+"
# Print table
print(separator)
print(format_table_row(headers, widths))
print(separator)
for row in rows:
print(format_table_row(row, widths))
print(separator)
def format_speedup(value):
"""Format speedup value with indicator if it's faster or slower."""
return f"{value:.2f}x {'faster' if value > 1.0 else 'slower'}"
def run_benchmarks(verbose: bool = False):
"""Run benchmarks for a set of common shapes."""
print("===== STARTING FP8 GEMM BENCHMARK =====")
# Make sure we're using the GPU
if not torch.cuda.is_available():
print("CUDA not available! Tests require GPU.")
return
# Print system information
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA version: {torch.version.cuda}")
print(f"Triton version: {triton.__version__}")
print(f"Using device: {torch.cuda.get_device_name()}")
# Enable TF32 for better performance
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
# Set seeds for reproducibility
torch.manual_seed(42)
torch.cuda.manual_seed(42)
# Define benchmark shapes (m, n, k)
shapes = [
(8, 4096, 7168),
(8, 7168, 18432),
(8, 18432, 7168),
(64, 4096, 7168),
(64, 7168, 18432),
(64, 18432, 7168),
(64, 24576, 1536),
(64, 32768, 512),
(64, 7168, 16384),
(128, 4096, 7168),
(128, 7168, 18432),
(128, 18432, 7168),
(1024, 4096, 7168),
(1024, 18432, 7168),
(2048, 4096, 7168),
(4096, 4096, 7168),
]
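    # NOTE: the shape list below overrides the full list above; remove or merge it
    # to benchmark every shape defined earlier.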
shapes = [
# (64, 2112, 7168),
(64, 24576, 1536),
(64, 32768, 512),
(64, 7168, 16384),
(64, 4096, 7168),
(64, 7168, 2048),
# (128, 2112, 7168),
(128, 24576, 1536),
(128, 32768, 512),
(128, 7168, 16384),
(128, 4096, 7168),
(128, 7168, 2048),
# (4096, 2112, 7168),
(4096, 24576, 1536),
(4096, 32768, 512),
(4096, 7168, 16384),
(4096, 4096, 7168),
(4096, 7168, 2048),
]
all_results = []
for m, n, k in shapes:
result = benchmark_shape(m, n, k, verbose=verbose)
all_results.append(result)
# Print results in a nicely formatted table
print("\n===== PERFORMANCE COMPARISON =====")
# Print DeepGEMM table
deepgemm_headers = ["m", "n", "k", "Time (μs)", "TFLOPS", "GB/s"]
deepgemm_rows = []
for result in all_results:
shape = result["shape"]
impl_data = result["implementations"]["DeepGEMM"]
deepgemm_rows.append([
shape["m"], shape["n"], shape["k"], f"{impl_data['time_us']:.1f}",
f"{impl_data['tflops']:.1f}", f"{impl_data['gb_s']:.1f}"
])
print_table(deepgemm_headers,
deepgemm_rows,
title="DeepGEMM Implementation:")
# Print vLLM Triton table
triton_headers = [
"m", "n", "k", "Time (μs)", "TFLOPS", "GB/s", "vs DeepGEMM"
]
triton_rows = []
for result in all_results:
shape = result["shape"]
impl_data = result["implementations"]["vLLM Triton"]
speedup = impl_data.get("speedup_vs_deepgemm", 1.0)
triton_rows.append([
shape["m"], shape["n"], shape["k"], f"{impl_data['time_us']:.1f}",
f"{impl_data['tflops']:.1f}", f"{impl_data['gb_s']:.1f}",
format_speedup(speedup)
])
print_table(triton_headers,
triton_rows,
title="vLLM Triton Implementation:")
# Print vLLM CUTLASS table
cutlass_headers = [
"m", "n", "k", "Time (μs)", "TFLOPS", "GB/s", "vs DeepGEMM",
"vs Triton"
]
cutlass_rows = []
for result in all_results:
shape = result["shape"]
impl_data = result["implementations"]["vLLM CUTLASS"]
vs_deepgemm = impl_data.get("speedup_vs_deepgemm", 1.0)
vs_triton = impl_data.get("speedup_vs_triton", 1.0)
cutlass_rows.append([
shape["m"], shape["n"], shape["k"], f"{impl_data['time_us']:.1f}",
f"{impl_data['tflops']:.1f}", f"{impl_data['gb_s']:.1f}",
format_speedup(vs_deepgemm),
format_speedup(vs_triton)
])
print_table(cutlass_headers,
cutlass_rows,
title="vLLM CUTLASS Implementation:")
# Calculate and print averages
print("\n===== AVERAGE PERFORMANCE =====")
implementations = ["DeepGEMM", "vLLM Triton", "vLLM CUTLASS"]
avg_metrics = {
impl: {
"tflops": 0,
"gb_s": 0,
"time_ms": 0
}
for impl in implementations
}
for result in all_results:
for impl in implementations:
impl_data = result["implementations"][impl]
avg_metrics[impl]["tflops"] += impl_data["tflops"]
avg_metrics[impl]["gb_s"] += impl_data["gb_s"]
avg_metrics[impl]["time_ms"] += impl_data["time_ms"]
num_shapes = len(all_results)
avg_headers = ["Implementation", "Avg TFLOPS", "Avg GB/s", "Avg Time (ms)"]
avg_rows = []
for impl in implementations:
avg_tflops = avg_metrics[impl]["tflops"] / num_shapes
avg_mem_bw = avg_metrics[impl]["gb_s"] / num_shapes
avg_time = avg_metrics[impl]["time_ms"] / num_shapes
avg_rows.append([
impl, f"{avg_tflops:.2f}", f"{avg_mem_bw:.2f}", f"{avg_time:.2f}"
])
print_table(avg_headers, avg_rows)
# Calculate average speedups
avg_speedups = {
"DeepGEMM vs vLLM Triton": 0,
"DeepGEMM vs vLLM CUTLASS": 0,
"vLLM CUTLASS vs vLLM Triton": 0
}
for result in all_results:
deepgemm_time = result["implementations"]["DeepGEMM"]["time_ms"]
vllm_triton_time = result["implementations"]["vLLM Triton"]["time_ms"]
vllm_cutlass_time = result["implementations"]["vLLM CUTLASS"][
"time_ms"]
avg_speedups[
"DeepGEMM vs vLLM Triton"] += vllm_triton_time / deepgemm_time
avg_speedups[
"DeepGEMM vs vLLM CUTLASS"] += vllm_cutlass_time / deepgemm_time
avg_speedups[
"vLLM CUTLASS vs vLLM Triton"] += vllm_triton_time / vllm_cutlass_time
print("\n===== AVERAGE SPEEDUPS =====")
speedup_headers = ["Comparison", "Speedup"]
speedup_rows = []
for comparison, total in avg_speedups.items():
avg_speedup = total / num_shapes
status = "faster" if avg_speedup > 1 else "slower"
speedup_rows.append([comparison, f"{avg_speedup:.2f}x {status}"])
print_table(speedup_headers, speedup_rows)
# Average accuracy comparison
print("\n===== ACCURACY COMPARISON =====")
avg_diff = {impl: 0 for impl in implementations}
for result in all_results:
for impl in implementations:
avg_diff[impl] += result["implementations"][impl]["diff"][
"Reference"]
diff_headers = ["Implementation", "Avg Diff vs Reference"]
diff_rows = []
for impl in implementations:
diff_rows.append([impl, f"{avg_diff[impl] / num_shapes:.6f}"])
print_table(diff_headers, diff_rows)
if __name__ == "__main__":
run_benchmarks(verbose=False)
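The per-token/per-block helpers above store FP8 values scaled into the E4M3 range (|x| <= 448) together with the inverse scales. A minimal round-trip sketch using the `per_token_cast_to_fp8` helper defined above (assumes a CUDA device and a PyTorch build with `torch.float8_e4m3fn`):

```python
# Round-trip check for per_token_cast_to_fp8: dequantizing with the returned
# per-128-element-group scales should recover the input up to FP8 rounding error.
import torch

x = torch.randn(4, 256, device="cuda", dtype=torch.bfloat16)
x_fp8, scales = per_token_cast_to_fp8(x)               # scales: [4, 256 // 128]
x_rec = (x_fp8.float().view(4, -1, 128) * scales.unsqueeze(2)).view(4, 256)
print((x_rec - x.float()).abs().max())                 # small quantization error
```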

View File

@ -4,7 +4,6 @@ import math
import pickle
import re
from collections import defaultdict
from typing import List
import matplotlib.pyplot as plt
import pandas as pd
@ -23,7 +22,7 @@ if __name__ == "__main__":
with open(args.filename, 'rb') as f:
data = pickle.load(f)
raw_results: List[TMeasurement] = data["results"]
raw_results: list[TMeasurement] = data["results"]
results = defaultdict(lambda: list())
for v in raw_results:

View File

@ -1,7 +1,8 @@
# SPDX-License-Identifier: Apache-2.0
import dataclasses
from typing import Any, Callable, Iterable, Optional
from collections.abc import Iterable
from typing import Any, Callable, Optional
import torch
import torch.utils.benchmark as TBenchmark

View File

@ -0,0 +1,64 @@
#!/bin/bash
# Define the model to use
MODEL=${1:-"Qwen/Qwen2.5-7B-Instruct"}
# Define the backend to use
BACKEND=${2:-"vllm"}
# Define the dataset to use
DATASET=${3:-"xgrammar_bench"}
# Define the guided decoding backend
GUIDED_BACKEND=${4:-"xgrammar"}
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
OUTPUT_DIR=${5:-"$SCRIPT_DIR/structured_output_benchmark_results"}
GUIDED_RATIO=${6:-0.5}
# Create output directory if it doesn't exist
mkdir -p "$OUTPUT_DIR"
# Define QPS values to test
QPS_VALUES=(70 60 50 25 20 15 10)
# Common parameters
COMMON_PARAMS="--backend $BACKEND \
--model $MODEL \
--dataset $DATASET \
--structured-output-backend $GUIDED_BACKEND \
--structured-output-ratio $GUIDED_RATIO \
--save-results \
--result-dir $OUTPUT_DIR"
echo "Starting structured output benchmark with model: $MODEL"
echo "Backend: $BACKEND"
echo "Dataset: $DATASET"
echo "Structured output backend: $GUIDED_BACKEND"
echo "Results will be saved to: $OUTPUT_DIR"
echo "----------------------------------------"
# Run benchmarks with different QPS values
for qps in "${QPS_VALUES[@]}"; do
echo "Running benchmark with QPS: $qps"
# Get git hash and branch for the filename
GIT_HASH=$(git rev-parse --short HEAD 2>/dev/null || echo "unknown")
GIT_BRANCH=$(git rev-parse --abbrev-ref HEAD 2>/dev/null || echo "unknown")
# Construct filename for this run
FILENAME="${GUIDED_BACKEND}_${BACKEND}_${qps}qps_$(basename $MODEL)_${DATASET}_${GIT_HASH}.json"
# Run the benchmark
python "$SCRIPT_DIR/benchmark_serving_structured_output.py" $COMMON_PARAMS \
--request-rate $qps \
--result-filename "$FILENAME" \
--port ${PORT:-8000}
echo "Completed benchmark with QPS: $qps"
echo "----------------------------------------"
done
echo "All benchmarks completed!"
echo "Results saved to: $OUTPUT_DIR"

View File

@ -1,113 +1,19 @@
{
"$schema":
"https://json-schema.org/draft/2020-12/schema",
"title":
"User Profile",
"type":
"object",
"type": "object",
"properties": {
"userId": {
"type": "string",
"description": "Unique identifier for the user."
},
"personalInfo": {
"type": "object",
"properties": {
"firstName": {
"type": "string",
"description": "The user's first name."
},
"lastName": {
"type": "string",
"description": "The user's last name."
},
"age": {
"type": "integer",
"minimum": 0,
"description": "The user's age."
},
"phoneNumbers": {
"type":
"array",
"items": {
"type": "object",
"properties": {
"type": {
"type": "string",
"enum": ["home", "work", "mobile"],
"description": "Type of phone number."
},
"number": {
"type": "string",
"pattern": "^\\+?[1-9]\\d{1,14}$",
"description": "Phone number in E.164 format."
}
},
"required": ["type", "number"]
},
"description":
"List of phone numbers associated with the user."
}
},
"required": ["firstName", "lastName"]
},
"address": {
"type": "object",
"properties": {
"street": {
"type": "string",
"description": "Street address."
},
"city": {
"type": "string",
"description": "City name."
},
"state": {
"type": "string",
"description": "State or province."
},
"postalCode": {
"type": "string",
"pattern": "^\\d{5}(-\\d{4})?$",
"description": "Postal code."
},
"country": {
"type": "string",
"description": "Country name."
}
},
"required": ["street", "city", "state", "postalCode", "country"]
},
"preferences": {
"type": "object",
"properties": {
"newsletterSubscribed": {
"type":
"boolean",
"description":
"Indicates if the user is subscribed to the newsletter."
},
"favoriteCategories": {
"type": "array",
"items": {
"type": "string"
},
"description": "List of user's favorite categories."
}
},
"required": ["newsletterSubscribed"]
},
"accountStatus": {
"type": "string",
"enum": ["active", "inactive", "suspended"],
"description": "Current status of the user's account."
},
"registrationDate": {
"type": "string",
"format": "date-time",
"description": "ISO 8601 formatted date-time of user registration."
}
"name": { "type": "string" },
"email": { "type": "string" },
"street": { "type": "string" },
"city": { "type": "string" },
"state": { "type": "string" },
"zip": { "type": "string" },
"phone": { "type": "string" },
"website": { "type": "string" },
"company": { "type": "string" },
"age": { "type": "integer" }
},
"required":
["userId", "personalInfo", "address", "accountStatus", "registrationDate"]
}
"required": [
"name",
"email"
]
}
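For illustration, only `name` and `email` are required by the simplified schema; a quick check with the third-party `jsonschema` package (an assumption for this example, not part of the diff):

```python
import jsonschema

# Abridged copy of the simplified schema above (remaining string fields elided).
schema = {
    "type": "object",
    "properties": {
        "name": {"type": "string"},
        "email": {"type": "string"},
        "age": {"type": "integer"},
    },
    "required": ["name", "email"],
}

# Passes validation: only "name" and "email" are required.
jsonschema.validate({"name": "Ada Lovelace", "email": "ada@example.com"}, schema)
```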

View File

@ -81,6 +81,7 @@ else()
find_isa(${CPUINFO} "POWER9" POWER9_FOUND)
find_isa(${CPUINFO} "asimd" ASIMD_FOUND) # Check for ARM NEON support
find_isa(${CPUINFO} "bf16" ARM_BF16_FOUND) # Check for ARM BF16 support
find_isa(${CPUINFO} "S390" S390_FOUND)
endif()
@ -129,8 +130,16 @@ elseif (ASIMD_FOUND)
elseif(APPLE_SILICON_FOUND)
message(STATUS "Apple Silicon Detected")
set(ENABLE_NUMA OFF)
elseif (S390_FOUND)
message(STATUS "S390 detected")
# Check for S390 VXE support
list(APPEND CXX_COMPILE_FLAGS
"-mvx"
"-mzvector"
"-march=native"
"-mtune=native")
else()
message(FATAL_ERROR "vLLM CPU backend requires AVX512, AVX2, Power9+ ISA or ARMv8 support.")
message(FATAL_ERROR "vLLM CPU backend requires AVX512, AVX2, Power9+ ISA, S390X ISA or ARMv8 support.")
endif()
#
@ -140,7 +149,7 @@ if (AVX512_FOUND AND NOT AVX512_DISABLED)
FetchContent_Declare(
oneDNN
GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
GIT_TAG v3.6
GIT_TAG v3.7.1
GIT_PROGRESS TRUE
GIT_SHALLOW TRUE
)

View File

@ -0,0 +1,66 @@
include(FetchContent)
# If FLASH_MLA_SRC_DIR is set, flash-mla is installed from that directory
# instead of downloading.
# It can be set as an environment variable or passed as a cmake argument.
# The environment variable takes precedence.
if (DEFINED ENV{FLASH_MLA_SRC_DIR})
set(FLASH_MLA_SRC_DIR $ENV{FLASH_MLA_SRC_DIR})
endif()
if(FLASH_MLA_SRC_DIR)
FetchContent_Declare(
flashmla
SOURCE_DIR ${FLASH_MLA_SRC_DIR}
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
)
else()
FetchContent_Declare(
flashmla
GIT_REPOSITORY https://github.com/vllm-project/FlashMLA.git
GIT_TAG 575f7724b9762f265bbee5889df9c7d630801845
GIT_PROGRESS TRUE
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
)
endif()
FetchContent_MakeAvailable(flashmla)
message(STATUS "FlashMLA is available at ${flashmla_SOURCE_DIR}")
# The FlashMLA kernels only work on hopper and require CUDA 12.3 or later.
# Only build FlashMLA kernels if we are building for something compatible with
# sm90a
cuda_archs_loose_intersection(FLASH_MLA_ARCHS "9.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.3 AND FLASH_MLA_ARCHS)
set(FlashMLA_SOURCES
${flashmla_SOURCE_DIR}/csrc/flash_api.cpp
${flashmla_SOURCE_DIR}/csrc/flash_fwd_mla_bf16_sm90.cu
${flashmla_SOURCE_DIR}/csrc/flash_fwd_mla_fp16_sm90.cu
${flashmla_SOURCE_DIR}/csrc/flash_fwd_mla_metadata.cu)
set(FlashMLA_INCLUDES
${flashmla_SOURCE_DIR}/csrc/cutlass/include
${flashmla_SOURCE_DIR}/csrc/include)
set_gencode_flags_for_srcs(
SRCS "${FlashMLA_SOURCES}"
CUDA_ARCHS "${FLASH_MLA_ARCHS}")
define_gpu_extension_target(
_flashmla_C
DESTINATION vllm
LANGUAGE ${VLLM_GPU_LANG}
SOURCES ${FlashMLA_SOURCES}
COMPILE_FLAGS ${VLLM_GPU_FLAGS}
ARCHITECTURES ${VLLM_GPU_ARCHES}
INCLUDE_DIRECTORIES ${FlashMLA_INCLUDES}
USE_SABI 3
WITH_SOABI)
else()
# Create an empty target for setup.py when not targeting sm90a systems
add_custom_target(_flashmla_C)
endif()

View File

@ -0,0 +1,67 @@
# vLLM flash attention requires VLLM_GPU_ARCHES to contain the set of target
# arches in the CMake syntax (75-real, 89-virtual, etc), since we clear the
# arches in the CUDA case (and instead set the gencodes on a per file basis)
# we need to manually set VLLM_GPU_ARCHES here.
if(VLLM_GPU_LANG STREQUAL "CUDA")
foreach(_ARCH ${CUDA_ARCHS})
string(REPLACE "." "" _ARCH "${_ARCH}")
list(APPEND VLLM_GPU_ARCHES "${_ARCH}-real")
endforeach()
endif()
#
# Build vLLM flash attention from source
#
# IMPORTANT: This has to be the last thing we do, because vllm-flash-attn uses the same macros/functions as vLLM.
# Because functions all belong to the global scope, vllm-flash-attn's functions overwrite vLLM's.
# They should be identical but if they aren't, this is a massive footgun.
#
# The vllm-flash-attn install rules are nested under vllm to make sure the library gets installed in the correct place.
# To only install vllm-flash-attn, use --component _vllm_fa2_C (for FA2) or --component _vllm_fa3_C (for FA3).
# If no component is specified, vllm-flash-attn is still installed.
# If VLLM_FLASH_ATTN_SRC_DIR is set, vllm-flash-attn is installed from that directory instead of downloading.
# This is to enable local development of vllm-flash-attn within vLLM.
# It can be set as an environment variable or passed as a cmake argument.
# The environment variable takes precedence.
if (DEFINED ENV{VLLM_FLASH_ATTN_SRC_DIR})
set(VLLM_FLASH_ATTN_SRC_DIR $ENV{VLLM_FLASH_ATTN_SRC_DIR})
endif()
if(VLLM_FLASH_ATTN_SRC_DIR)
FetchContent_Declare(
vllm-flash-attn SOURCE_DIR
${VLLM_FLASH_ATTN_SRC_DIR}
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
)
else()
FetchContent_Declare(
vllm-flash-attn
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
GIT_TAG 9bfa9869829d8c593527eb34c5271d0090f7ccc9
GIT_PROGRESS TRUE
# Don't share the vllm-flash-attn build between build types
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn
)
endif()
# Fetch the vllm-flash-attn library
FetchContent_MakeAvailable(vllm-flash-attn)
message(STATUS "vllm-flash-attn is available at ${vllm-flash-attn_SOURCE_DIR}")
# Copy over the vllm-flash-attn python files (duplicated for fa2 and fa3, in
# case only one is built, in the case both are built redundant work is done)
install(
DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/
DESTINATION vllm_flash_attn
COMPONENT _vllm_fa2_C
FILES_MATCHING PATTERN "*.py"
)
install(
DIRECTORY ${vllm-flash-attn_SOURCE_DIR}/vllm_flash_attn/
DESTINATION vllm_flash_attn
COMPONENT _vllm_fa3_C
FILES_MATCHING PATTERN "*.py"
)

View File

@ -257,9 +257,9 @@ endmacro()
# where `<=` is the version comparison operator.
# In other words, for each version in `TGT_CUDA_ARCHS` find the highest version
# in `SRC_CUDA_ARCHS` that is less or equal to the version in `TGT_CUDA_ARCHS`.
# We have special handling for 9.0a, if 9.0a is in `SRC_CUDA_ARCHS` and 9.0 is
# in `TGT_CUDA_ARCHS` then we should remove 9.0a from `SRC_CUDA_ARCHS` and add
# 9.0a to the result (and remove 9.0 from TGT_CUDA_ARCHS).
# We have special handling for x.0a, if x.0a is in `SRC_CUDA_ARCHS` and x.0 is
# in `TGT_CUDA_ARCHS` then we should remove x.0a from `SRC_CUDA_ARCHS` and add
# x.0a to the result (and remove x.0 from TGT_CUDA_ARCHS).
# The result is stored in `OUT_CUDA_ARCHS`.
#
# Example:
@ -272,8 +272,8 @@ function(cuda_archs_loose_intersection OUT_CUDA_ARCHS SRC_CUDA_ARCHS TGT_CUDA_AR
list(REMOVE_DUPLICATES SRC_CUDA_ARCHS)
set(TGT_CUDA_ARCHS_ ${TGT_CUDA_ARCHS})
# if 9.0a is in SRC_CUDA_ARCHS and 9.0 is in CUDA_ARCHS then we should
# remove 9.0a from SRC_CUDA_ARCHS and add 9.0a to _CUDA_ARCHS
# if x.0a is in SRC_CUDA_ARCHS and x.0 is in CUDA_ARCHS then we should
# remove x.0a from SRC_CUDA_ARCHS and add x.0a to _CUDA_ARCHS
set(_CUDA_ARCHS)
if ("9.0a" IN_LIST SRC_CUDA_ARCHS)
list(REMOVE_ITEM SRC_CUDA_ARCHS "9.0a")
@ -283,6 +283,14 @@ function(cuda_archs_loose_intersection OUT_CUDA_ARCHS SRC_CUDA_ARCHS TGT_CUDA_AR
endif()
endif()
if ("10.0a" IN_LIST SRC_CUDA_ARCHS)
list(REMOVE_ITEM SRC_CUDA_ARCHS "10.0a")
if ("10.0" IN_LIST TGT_CUDA_ARCHS)
list(REMOVE_ITEM TGT_CUDA_ARCHS_ "10.0")
set(_CUDA_ARCHS "10.0a")
endif()
endif()
list(SORT SRC_CUDA_ARCHS COMPARE NATURAL ORDER ASCENDING)
# for each ARCH in TGT_CUDA_ARCHS find the highest arch in SRC_CUDA_ARCHS that

View File

@ -39,3 +39,10 @@ void concat_and_cache_mla(torch::Tensor& kv_c, torch::Tensor& k_pe,
// Just for unittest
void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache,
const double scale, const std::string& kv_cache_dtype);
void gather_cache(
torch::Tensor const& src_cache, // [NUM_BLOCKS, BLOCK_SIZE, ENTRIES...]
torch::Tensor const& dst, // [TOT_TOKENS, ENTRIES...]
torch::Tensor const& block_table, // [BATCH, BLOCK_INDICES]
torch::Tensor const& cu_seq_lens, // [BATCH+1]
int64_t batch_size, std::optional<torch::Tensor> seq_starts = std::nullopt);

View File

@ -2,6 +2,7 @@
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include "cuda_utils.h"
#include "cuda_compat.h"
#include "dispatch_utils.h"
@ -374,7 +375,7 @@ void reshape_and_cache(
torch::Tensor& slot_mapping, // [num_tokens]
const std::string& kv_cache_dtype, torch::Tensor& k_scale,
torch::Tensor& v_scale) {
int num_tokens = key.size(0);
int num_tokens = slot_mapping.size(0);
int num_heads = key.size(1);
int head_size = key.size(2);
int block_size = key_cache.size(3);
@ -570,3 +571,161 @@ void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache,
TORCH_CHECK(false, "Unsupported data type: ", kv_cache_dtype);
}
}
namespace vllm {
// grid is launched with dimensions (batch, num_splits)
template <typename scalar_t>
__global__ void gather_cache(
const scalar_t* __restrict__ src_cache, // [NUM_BLOCKS, BLOCK_SIZE,
// ENTRIES...]
scalar_t* __restrict__ dst, // [TOT_TOKENS, ENTRIES...]
const int32_t* __restrict__ block_table, // [BATCH, BLOCK_INDICES]
const int32_t* __restrict__ cu_seq_lens, // [BATCH+1]
const int32_t block_size, const int32_t entry_size,
const int64_t block_table_stride, const int64_t cache_block_stride,
const int64_t cache_entry_stride, const int64_t dst_entry_stride,
const int32_t* __restrict__ seq_starts) { // Optional: starting offsets per
// batch
const int64_t bid = blockIdx.x; // Batch ID
const int32_t num_splits = gridDim.y;
const int32_t split = blockIdx.y;
const int32_t seq_start = cu_seq_lens[bid];
const int32_t seq_end = cu_seq_lens[bid + 1];
const int32_t seq_len = seq_end - seq_start;
const int32_t tot_blocks = cuda_utils::ceil_div(seq_len, block_size);
const int32_t split_blocks = cuda_utils::ceil_div(tot_blocks, num_splits);
const int32_t split_start = split * split_blocks;
const int32_t split_end = min((split + 1) * split_blocks, tot_blocks);
const bool is_active_split = (split_start < tot_blocks);
const bool is_last_split = (split_end == tot_blocks);
if (!is_active_split) return;
int32_t full_blocks_end = split_end;
int32_t partial_block_size = 0;
// Adjust the pointer for the block_table for this batch.
// If seq_starts is provided, compute an offset based on (seq_starts[bid] /
// page_size)
const int32_t batch_offset = bid * block_table_stride;
int32_t offset = 0;
if (seq_starts != nullptr) {
offset = seq_starts[bid] / block_size;
}
const int32_t* batch_block_table = block_table + batch_offset + offset;
// Adjust dst pointer based on the cumulative sequence lengths.
dst += seq_start * dst_entry_stride;
if (is_last_split) {
partial_block_size = seq_len % block_size;
if (partial_block_size) full_blocks_end -= 1;
}
auto copy_entry = [&](const scalar_t* __restrict__ _src,
scalar_t* __restrict__ _dst) {
for (int i = threadIdx.x; i < entry_size; i += blockDim.x)
_dst[i] = _src[i];
};
for (int pid = split_start; pid < full_blocks_end; ++pid) {
auto block_id = batch_block_table[pid];
auto block_start_ptr = src_cache + block_id * cache_block_stride;
auto block_dst_ptr = dst + pid * block_size * dst_entry_stride;
for (int eid = 0; eid < block_size; ++eid) {
copy_entry(block_start_ptr + eid * cache_entry_stride,
block_dst_ptr + eid * dst_entry_stride);
}
}
if (partial_block_size) {
auto block_id = batch_block_table[full_blocks_end];
auto block_start_ptr = src_cache + block_id * cache_block_stride;
auto block_dst_ptr = dst + full_blocks_end * block_size * dst_entry_stride;
for (int eid = 0; eid < partial_block_size; ++eid) {
copy_entry(block_start_ptr + eid * cache_entry_stride,
block_dst_ptr + eid * dst_entry_stride);
}
}
}
} // namespace vllm
// Macro to dispatch the kernel based on the data type.
#define CALL_GATHER_CACHE(CPY_DTYPE) \
vllm::gather_cache<CPY_DTYPE><<<grid, block, 0, stream>>>( \
reinterpret_cast<CPY_DTYPE*>(src_cache.data_ptr()), \
reinterpret_cast<CPY_DTYPE*>(dst.data_ptr()), \
block_table.data_ptr<int32_t>(), cu_seq_lens.data_ptr<int32_t>(), \
block_size, entry_size, block_table_stride, cache_block_stride, \
cache_entry_stride, dst_entry_stride, seq_starts_ptr);
// Gather sequences from the cache into the destination tensor.
// - cu_seq_lens contains the cumulative sequence lengths for each batch
// - block_table contains the cache block indices for each sequence
// - Optionally, seq_starts (if provided) offsets the starting block index by
// (seq_starts[bid] / page_size)
void gather_cache(
torch::Tensor const& src_cache, // [NUM_BLOCKS, BLOCK_SIZE, ENTRIES...]
torch::Tensor const& dst, // [TOT_TOKENS, ENTRIES...]
torch::Tensor const& block_table, // [BATCH, BLOCK_INDICES]
torch::Tensor const& cu_seq_lens, // [BATCH+1]
int64_t batch_size,
std::optional<torch::Tensor> seq_starts = std::nullopt) {
at::cuda::OptionalCUDAGuard device_guard(src_cache.device());
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
int32_t block_size = src_cache.size(1);
int32_t entry_size = src_cache.flatten(2, -1).size(2);
TORCH_CHECK(block_table.dtype() == torch::kInt32,
"block_table must be int32");
TORCH_CHECK(cu_seq_lens.dtype() == torch::kInt32,
"cu_seq_lens must be int32");
if (seq_starts.has_value()) {
TORCH_CHECK(seq_starts.value().dtype() == torch::kInt32,
"seq_starts must be int32");
}
TORCH_CHECK(src_cache.device() == dst.device(),
"src_cache and dst must be on the same device");
TORCH_CHECK(src_cache.device() == block_table.device(),
"src_cache and block_table must be on the same device");
TORCH_CHECK(src_cache.device() == cu_seq_lens.device(),
"src_cache and cu_seq_lens must be on the same device");
if (seq_starts.has_value()) {
TORCH_CHECK(src_cache.device() == seq_starts.value().device(),
"src_cache and seq_starts must be on the same device");
}
int64_t block_table_stride = block_table.stride(0);
int64_t cache_block_stride = src_cache.stride(0);
int64_t cache_entry_stride = src_cache.stride(1);
int64_t dst_entry_stride = dst.stride(0);
// Decide on the number of splits based on the batch size.
int num_splits = batch_size > 128 ? 2 : batch_size > 64 ? 4 : 16;
dim3 grid(batch_size, num_splits);
dim3 block(1024);
TORCH_CHECK(src_cache.dtype() == dst.dtype(),
"src_cache and dst must have the same dtype");
const int dtype_bits = src_cache.element_size() * 8;
const int32_t* seq_starts_ptr =
seq_starts.has_value() ? seq_starts.value().data_ptr<int32_t>() : nullptr;
if (dtype_bits == 32) {
CALL_GATHER_CACHE(uint32_t);
} else if (dtype_bits == 16) {
CALL_GATHER_CACHE(uint16_t);
} else if (dtype_bits == 8) {
CALL_GATHER_CACHE(uint8_t);
} else {
TORCH_CHECK(false, "Unsupported data type width: ", dtype_bits);
}
}
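As a rough, hypothetical reference for what the new kernel computes (not part of the diff): a pure-PyTorch sketch that gathers each sequence's tokens from the paged cache using `block_table` and `cu_seq_lens`, ignoring the split/grid parallelism and assuming the trailing cache entries are already flattened to one dimension:

```python
import torch

def gather_cache_ref(src_cache, block_table, cu_seq_lens, seq_starts=None):
    """Gather each sequence's tokens from a paged cache into one contiguous tensor.

    src_cache: [NUM_BLOCKS, BLOCK_SIZE, ENTRY], block_table: [BATCH, BLOCK_INDICES],
    cu_seq_lens: [BATCH + 1]; mirrors the CUDA kernel's indexing, single-threaded.
    """
    _, block_size, entry = src_cache.shape
    dst = src_cache.new_empty(int(cu_seq_lens[-1]), entry)
    for b in range(block_table.size(0)):
        start, end = int(cu_seq_lens[b]), int(cu_seq_lens[b + 1])
        seq_len = end - start
        # Optional per-batch starting offset, in whole blocks (seq_starts[b] / block_size).
        offset = int(seq_starts[b]) // block_size if seq_starts is not None else 0
        tot_blocks = (seq_len + block_size - 1) // block_size
        blocks = block_table[b, offset:offset + tot_blocks].long()
        dst[start:end] = src_cache[blocks].reshape(-1, entry)[:seq_len]
    return dst
```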

View File

@ -7,8 +7,3 @@ inline constexpr uint32_t next_pow_2(uint32_t const num) {
if (num <= 1) return num;
return 1 << (CHAR_BIT * sizeof(num) - __builtin_clz(num - 1));
}
template <typename T>
inline constexpr std::enable_if_t<std::is_integral_v<T>, T> ceil_div(T a, T b) {
return (a + b - 1) / b;
}

View File

@ -24,8 +24,8 @@ struct KernelVecType<float> {
template <>
struct KernelVecType<c10::Half> {
#ifdef __powerpc64__
// Power architecture-specific vector types
#if defined(__powerpc64__) || defined(__s390x__)
// Power and s390x architecture-specific vector types
using q_load_vec_type = vec_op::FP32Vec8;
using k_load_vec_type = vec_op::FP32Vec16;
using v_load_vec_type = vec_op::FP32Vec16;

View File

@ -3,6 +3,12 @@
#include "cpu_types.hpp"
#if defined(__x86_64__)
#define DISPATCH_MACRO VLLM_DISPATCH_FLOATING_TYPES_WITH_E5M2
#else
#define DISPATCH_MACRO VLLM_DISPATCH_FLOATING_TYPES
#endif
namespace {
template <typename scalar_t>
void copy_blocks_cpu_impl(std::vector<torch::Tensor> const& key_caches,
@ -95,13 +101,12 @@ void copy_blocks(std::vector<torch::Tensor> const& key_caches,
}
const int element_num_per_block = key_caches[0][0].numel();
VLLM_DISPATCH_FLOATING_TYPES(
key_caches[0].scalar_type(), "copy_blocks_cpu_impl", [&] {
CPU_KERNEL_GUARD_IN(copy_blocks_cpu_impl)
copy_blocks_cpu_impl<scalar_t>(key_caches, value_caches, block_mapping,
element_num_per_block, num_layers);
CPU_KERNEL_GUARD_OUT(copy_blocks_cpu_impl)
});
DISPATCH_MACRO(key_caches[0].scalar_type(), "copy_blocks_cpu_impl", [&] {
CPU_KERNEL_GUARD_IN(copy_blocks_cpu_impl)
copy_blocks_cpu_impl<scalar_t>(key_caches, value_caches, block_mapping,
element_num_per_block, num_layers);
CPU_KERNEL_GUARD_OUT(copy_blocks_cpu_impl)
});
}
void reshape_and_cache(torch::Tensor& key, torch::Tensor& value,
@ -118,16 +123,15 @@ void reshape_and_cache(torch::Tensor& key, torch::Tensor& value,
int key_stride = key.stride(0);
int value_stride = value.stride(0);
VLLM_DISPATCH_FLOATING_TYPES(
key.scalar_type(), "reshape_and_cache_cpu_impl", [&] {
CPU_KERNEL_GUARD_IN(reshape_and_cache_cpu_impl)
reshape_and_cache_cpu_impl<scalar_t>(
key.data_ptr<scalar_t>(), value.data_ptr<scalar_t>(),
key_cache.data_ptr<scalar_t>(), value_cache.data_ptr<scalar_t>(),
slot_mapping.data_ptr<int64_t>(), num_tokens, key_stride,
value_stride, num_heads, head_size, block_size, x);
CPU_KERNEL_GUARD_OUT(reshape_and_cache_cpu_impl)
});
DISPATCH_MACRO(key.scalar_type(), "reshape_and_cache_cpu_impl", [&] {
CPU_KERNEL_GUARD_IN(reshape_and_cache_cpu_impl)
reshape_and_cache_cpu_impl<scalar_t>(
key.data_ptr<scalar_t>(), value.data_ptr<scalar_t>(),
key_cache.data_ptr<scalar_t>(), value_cache.data_ptr<scalar_t>(),
slot_mapping.data_ptr<int64_t>(), num_tokens, key_stride, value_stride,
num_heads, head_size, block_size, x);
CPU_KERNEL_GUARD_OUT(reshape_and_cache_cpu_impl)
});
}
void swap_blocks(torch::Tensor& src, torch::Tensor& dst,

View File

@ -7,6 +7,9 @@
#elif defined(__POWER9_VECTOR__)
// ppc implementation
#include "cpu_types_vsx.hpp"
#elif defined(__s390x__)
// s390 implementation
#include "cpu_types_vxe.hpp"
#elif defined(__aarch64__)
// arm implementation
#include "cpu_types_arm.hpp"

View File

@ -2,6 +2,10 @@
#include <torch/all.h>
#include <cmath>
#if defined(__APPLE__)
#include "omp.h"
#endif
namespace vec_op {
#ifdef ARM_BF16_SUPPORT

csrc/cpu/cpu_types_vxe.hpp (new file, 480 lines)
View File

@ -0,0 +1,480 @@
#ifndef CPU_TYPES_VXE_HPP
#define CPU_TYPES_VXE_HPP
#include <vecintrin.h>
#include <cmath>
#include <torch/all.h>
namespace vec_op {
#define vec_neg(a) (-(a))
#define vec_add(a, b) ((a) + (b))
#define vec_sub(a, b) ((a) - (b))
#define vec_mul(a, b) ((a) * (b))
#define vec_div(a, b) ((a) / (b))
#define vec_sr(a, b) ((a) >> (b))  // Vector Shift Right Algebraic
#define vec_sl(a, b) ((a) << (b)) // Vector Shift Left
// FIXME: FP16 is not fully supported in Torch-CPU
#define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \
AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \
AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__)
#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \
AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__))
#ifndef CPU_OP_GUARD
#define CPU_KERNEL_GUARD_IN(NAME)
#define CPU_KERNEL_GUARD_OUT(NAME)
#else
#define CPU_KERNEL_GUARD_IN(NAME) \
std::cout << #NAME << " invoked." << std::endl;
#define CPU_KERNEL_GUARD_OUT(NAME) \
std::cout << #NAME << " exit." << std::endl;
#endif
#define FORCE_INLINE __attribute__((always_inline)) inline
namespace {
template <typename T, T... indexes, typename F>
constexpr void unroll_loop_item(std::integer_sequence<T, indexes...>, F&& f) {
(f(std::integral_constant<T, indexes>{}), ...);
}
}; // namespace
template <typename T, T count, typename F,
typename = std::enable_if_t<std::is_invocable_v<F, T>>>
constexpr void unroll_loop(F&& f) {
unroll_loop_item(std::make_integer_sequence<T, count>{}, std::forward<F>(f));
}
template <typename T>
struct Vec {
constexpr static int get_elem_num() { return T::VEC_ELEM_NUM; }
};
typedef struct ss16x8x2_t {
__vector signed short val[2];
} ss16x8x2_t;
typedef struct ss16x8x4_t {
__vector signed short val[4];
} ss16x8x4_t;
typedef struct f32x4x2_t {
__vector float val[2];
} f32x4x2_t;
typedef struct f32x4x4_t {
__vector float val[4];
} f32x4x4_t;
struct FP32Vec8;
struct FP32Vec16;
struct BF16Vec8 : public Vec<BF16Vec8> {
constexpr static int VEC_ELEM_NUM = 8;
__vector signed short reg;
explicit BF16Vec8(const void* ptr) : reg(*(__vector signed short*)ptr) {}
explicit BF16Vec8(const FP32Vec8&);
void save(void* ptr) const {
*reinterpret_cast<__vector signed short*>(ptr) = reg;
}
};
struct BF16Vec16 : public Vec<BF16Vec16> {
constexpr static int VEC_ELEM_NUM = 16;
ss16x8x2_t reg;
explicit BF16Vec16(const void* ptr) {
// Load 256 bits in two parts
reg.val[0] = (__vector signed short)vec_xl(0, (signed short*)ptr);
reg.val[1] = (__vector signed short)vec_xl(16, (signed short*)ptr);
}
explicit BF16Vec16(const FP32Vec16&);
void save(void* ptr) const {
// Save 256 bits in two parts
vec_xst(reg.val[0], 0, (signed short*)ptr);
vec_xst(reg.val[1], 16, (signed short*)ptr);
}
};
const static __vector signed short zero = vec_splats((signed short)0);
struct BF16Vec32 : public Vec<BF16Vec32> {
constexpr static int VEC_ELEM_NUM = 32;
ss16x8x4_t reg;
explicit BF16Vec32(const void* ptr)
: reg(*reinterpret_cast<const ss16x8x4_t*>(ptr)) {}
explicit BF16Vec32(ss16x8x4_t data) : reg(data) {}
explicit BF16Vec32(const BF16Vec8& vec8_data)
: reg({vec8_data.reg, vec8_data.reg, vec8_data.reg, vec8_data.reg}) {}
void save(void* ptr) const { *reinterpret_cast<ss16x8x4_t*>(ptr) = reg; }
};
struct FP32Vec4 : public Vec<FP32Vec4> {
constexpr static int VEC_ELEM_NUM = 4;
union AliasReg {
__vector float reg;
float values[VEC_ELEM_NUM];
};
__vector float reg;
explicit FP32Vec4(float v) : reg(vec_splats(v)) {}
explicit FP32Vec4() : reg(vec_splats(0.0f)) {}
explicit FP32Vec4(const float* ptr) : reg(vec_xl(0, ptr)) {}
explicit FP32Vec4(__vector float data) : reg(data) {}
explicit FP32Vec4(const FP32Vec4& data) : reg(data.reg) {}
};
struct FP32Vec8 : public Vec<FP32Vec8> {
constexpr static int VEC_ELEM_NUM = 8;
union AliasReg {
f32x4x2_t reg;
float values[VEC_ELEM_NUM];
};
f32x4x2_t reg;
explicit FP32Vec8(float v) {
reg.val[0] = vec_splats(v);
reg.val[1] = vec_splats(v);
}
explicit FP32Vec8() {
reg.val[0] = vec_splats(0.0f);
reg.val[1] = vec_splats(0.0f);
}
explicit FP32Vec8(const float* ptr) {
reg.val[0] = vec_xl(0, ptr);
reg.val[1] = vec_xl(16, ptr);
}
explicit FP32Vec8(f32x4x2_t data) : reg(data) {}
explicit FP32Vec8(const FP32Vec8& data) {
reg.val[0] = data.reg.val[0];
reg.val[1] = data.reg.val[1];
}
explicit FP32Vec8(const BF16Vec8& v) {
reg.val[0] = (__vector float)vec_mergeh(zero, v.reg);
reg.val[1] = (__vector float)vec_mergel(zero, v.reg);
}
float reduce_sum() const {
AliasReg ar;
ar.reg = reg;
float result = 0;
unroll_loop<int, VEC_ELEM_NUM>(
[&result, &ar](int i) { result += ar.values[i]; });
return result;
}
FP32Vec8 exp() const {
// TODO: Vectorize this
AliasReg ar;
ar.reg = reg;
f32x4x4_t ret;
ret.val[0][0] = std::exp(ar.values[0]);
ret.val[0][1] = std::exp(ar.values[1]);
ret.val[0][2] = std::exp(ar.values[2]);
ret.val[0][3] = std::exp(ar.values[3]);
ret.val[1][0] = std::exp(ar.values[4]);
ret.val[1][1] = std::exp(ar.values[5]);
ret.val[1][2] = std::exp(ar.values[6]);
ret.val[1][3] = std::exp(ar.values[7]);
return FP32Vec8(f32x4x2_t({ret.val[0], ret.val[1]}));
}
FP32Vec8 tanh() const {
// TODO: Vectorize this
AliasReg ar;
ar.reg = reg;
f32x4x4_t ret;
ret.val[0][0] = std::tanh(ar.values[0]);
ret.val[0][1] = std::tanh(ar.values[1]);
ret.val[0][2] = std::tanh(ar.values[2]);
ret.val[0][3] = std::tanh(ar.values[3]);
ret.val[1][0] = std::tanh(ar.values[4]);
ret.val[1][1] = std::tanh(ar.values[5]);
ret.val[1][2] = std::tanh(ar.values[6]);
ret.val[1][3] = std::tanh(ar.values[7]);
return FP32Vec8(f32x4x2_t({ret.val[0], ret.val[1]}));
}
FP32Vec8 er() const {
// TODO: Vectorize this
AliasReg ar;
ar.reg = reg;
f32x4x4_t ret;
ret.val[0][0] = std::erf(ar.values[0]);
ret.val[0][1] = std::erf(ar.values[1]);
ret.val[0][2] = std::erf(ar.values[2]);
ret.val[0][3] = std::erf(ar.values[3]);
ret.val[1][0] = std::erf(ar.values[4]);
ret.val[1][1] = std::erf(ar.values[5]);
ret.val[1][2] = std::erf(ar.values[6]);
ret.val[1][3] = std::erf(ar.values[7]);
return FP32Vec8(f32x4x2_t({ret.val[0], ret.val[1]}));
}
FP32Vec8 operator*(const FP32Vec8& b) const {
return FP32Vec8(
{vec_mul(reg.val[0], b.reg.val[0]), vec_mul(reg.val[1], b.reg.val[1])});
}
FP32Vec8 operator+(const FP32Vec8& b) const {
return FP32Vec8(
{vec_add(reg.val[0], b.reg.val[0]), vec_add(reg.val[1], b.reg.val[1])});
}
FP32Vec8 operator-(const FP32Vec8& b) const {
return FP32Vec8(
{vec_sub(reg.val[0], b.reg.val[0]), vec_sub(reg.val[1], b.reg.val[1])});
}
FP32Vec8 operator/(const FP32Vec8& b) const {
return FP32Vec8(
{vec_div(reg.val[0], b.reg.val[0]), vec_div(reg.val[1], b.reg.val[1])});
}
void save(float* ptr) const {
vec_xst(reg.val[0], 0, ptr);
vec_xst(reg.val[1], 16, ptr);
}
};
struct FP32Vec16 : public Vec<FP32Vec16> {
constexpr static int VEC_ELEM_NUM = 16;
union AliasReg {
f32x4x4_t reg;
float values[VEC_ELEM_NUM];
};
f32x4x4_t reg;
explicit FP32Vec16(float v) {
reg.val[0] = vec_splats(v);
reg.val[1] = vec_splats(v);
reg.val[2] = vec_splats(v);
reg.val[3] = vec_splats(v);
}
explicit FP32Vec16() {
reg.val[0] = vec_splats(0.0f);
reg.val[1] = vec_splats(0.0f);
reg.val[2] = vec_splats(0.0f);
reg.val[3] = vec_splats(0.0f);
}
explicit FP32Vec16(const float* ptr) {
reg.val[0] = vec_xl(0, ptr);
reg.val[1] = vec_xl(16, ptr);
reg.val[2] = vec_xl(32, ptr);
reg.val[3] = vec_xl(48, ptr);
}
explicit FP32Vec16(f32x4x4_t data) : reg(data) {}
explicit FP32Vec16(const FP32Vec16& data) {
reg.val[0] = data.reg.val[0];
reg.val[1] = data.reg.val[1];
reg.val[2] = data.reg.val[2];
reg.val[3] = data.reg.val[3];
}
explicit FP32Vec16(const FP32Vec4& data) {
reg.val[0] = data.reg;
reg.val[1] = data.reg;
reg.val[2] = data.reg;
reg.val[3] = data.reg;
}
explicit FP32Vec16(const FP32Vec8& data) {
reg.val[0] = data.reg.val[0];
reg.val[1] = data.reg.val[1];
reg.val[2] = data.reg.val[0];
reg.val[3] = data.reg.val[1];
}
explicit FP32Vec16(const BF16Vec16& v) {
reg.val[0] = (__vector float)vec_mergeh(zero, v.reg.val[0]);
reg.val[1] = (__vector float)vec_mergel(zero, v.reg.val[0]);
reg.val[2] = (__vector float)vec_mergeh(zero, v.reg.val[1]);
reg.val[3] = (__vector float)vec_mergel(zero, v.reg.val[1]);
}
explicit FP32Vec16(const BF16Vec8& v) : FP32Vec16(FP32Vec8(v)) {}
FP32Vec16 operator*(const FP32Vec16& b) const {
return FP32Vec16(f32x4x4_t({vec_mul(reg.val[0], b.reg.val[0]),
vec_mul(reg.val[1], b.reg.val[1]),
vec_mul(reg.val[2], b.reg.val[2]),
vec_mul(reg.val[3], b.reg.val[3])}));
}
FP32Vec16 operator+(const FP32Vec16& b) const {
return FP32Vec16(f32x4x4_t({vec_add(reg.val[0], b.reg.val[0]),
vec_add(reg.val[1], b.reg.val[1]),
vec_add(reg.val[2], b.reg.val[2]),
vec_add(reg.val[3], b.reg.val[3])}));
}
FP32Vec16 operator-(const FP32Vec16& b) const {
return FP32Vec16(f32x4x4_t({vec_sub(reg.val[0], b.reg.val[0]),
vec_sub(reg.val[1], b.reg.val[1]),
vec_sub(reg.val[2], b.reg.val[2]),
vec_sub(reg.val[3], b.reg.val[3])}));
}
FP32Vec16 operator/(const FP32Vec16& b) const {
return FP32Vec16(f32x4x4_t({vec_div(reg.val[0], b.reg.val[0]),
vec_div(reg.val[1], b.reg.val[1]),
vec_div(reg.val[2], b.reg.val[2]),
vec_div(reg.val[3], b.reg.val[3])}));
}
float reduce_sum() const {
AliasReg ar;
ar.reg = reg;
float result = 0;
unroll_loop<int, VEC_ELEM_NUM>(
[&result, &ar](int i) { result += ar.values[i]; });
return result;
}
template <int group_size>
float reduce_sub_sum(int idx) {
static_assert(VEC_ELEM_NUM % group_size == 0);
AliasReg ar;
ar.reg = reg;
float result = 0;
const int start = idx * group_size;
unroll_loop<int, group_size>(
[&result, &start, ar](int i) { result += ar.values[start + i]; });
return result;
}
void save(float* ptr) const {
vec_xst(reg.val[0], 0, ptr);
vec_xst(reg.val[1], 16, ptr);
vec_xst(reg.val[2], 32, ptr);
vec_xst(reg.val[3], 48, ptr);
}
};
template <typename T>
struct VecType {
using vec_type = void;
};
template <typename T>
using vec_t = typename VecType<T>::vec_type;
template <>
struct VecType<float> {
using vec_type = FP32Vec8;
};
template <>
struct VecType<c10::BFloat16> {
using vec_type = BF16Vec8;
};
template <typename T>
void storeFP32(float v, T* ptr) {
*ptr = v;
}
inline void fma(FP32Vec16& acc, FP32Vec16& a, FP32Vec16& b) {
acc = acc + a * b;
}
namespace c10 {
struct BFloat16 {
uint16_t value; // Assume BFloat16 is defined as a struct containing a 16-bit
// value.
};
} // namespace c10
template <>
inline void storeFP32<c10::BFloat16>(float v, c10::BFloat16* ptr) {
c10::BFloat16 __attribute__((__may_alias__))* v_ptr =
reinterpret_cast<c10::BFloat16*>(&v);
*ptr = *(v_ptr + 1);
}
#ifndef __VEC_CLASS_FP_NAN
#define __VEC_CLASS_FP_NAN (1 << 6)
#endif
const static __vector unsigned char omask = {2, 3, 6, 7, 10, 11, 14, 15,
18, 19, 22, 23, 26, 27, 30, 31};
const static __vector unsigned int bias = {0x00007fff, 0x00007fff, 0x00007fff,
0x00007fff};
const static __vector unsigned int nan = {0x7fc00000, 0x7fc00000, 0x7fc00000,
0x7fc00000};
const static __vector unsigned int sh16 = {16, 16, 16, 16};
const static __vector unsigned int one = {1, 1, 1, 1};
inline BF16Vec8::BF16Vec8(const FP32Vec8& v) {
__vector unsigned int inp0 = (__vector unsigned int)(v.reg.val[0]);
__vector unsigned int inp1 = (__vector unsigned int)(v.reg.val[1]);
int cc;
__vector __bool int sel0 =
vec_fp_test_data_class(v.reg.val[0], __VEC_CLASS_FP_NAN, &cc);
__vector __bool int sel1 =
vec_fp_test_data_class(v.reg.val[1], __VEC_CLASS_FP_NAN, &cc);
inp0 = vec_sel(inp0, nan, sel0) >> sh16;
inp1 = vec_sel(inp1, nan, sel1) >> sh16;
reg = (__vector signed short)vec_perm(inp0, inp1, omask);
}
inline BF16Vec16::BF16Vec16(const FP32Vec16& v) {
__vector unsigned int inp0 = (__vector unsigned int)(v.reg.val[0]);
__vector unsigned int inp1 = (__vector unsigned int)(v.reg.val[1]);
__vector unsigned int inp2 = (__vector unsigned int)(v.reg.val[2]);
__vector unsigned int inp3 = (__vector unsigned int)(v.reg.val[3]);
int cc;
__vector __bool int sel0 =
vec_fp_test_data_class(v.reg.val[0], __VEC_CLASS_FP_NAN, &cc);
__vector __bool int sel1 =
vec_fp_test_data_class(v.reg.val[1], __VEC_CLASS_FP_NAN, &cc);
__vector __bool int sel2 =
vec_fp_test_data_class(v.reg.val[2], __VEC_CLASS_FP_NAN, &cc);
__vector __bool int sel3 =
vec_fp_test_data_class(v.reg.val[3], __VEC_CLASS_FP_NAN, &cc);
inp0 = vec_sel(inp0, nan, sel0) >> sh16;
inp1 = vec_sel(inp1, nan, sel1) >> sh16;
inp2 = vec_sel(inp2, nan, sel2) >> sh16;
inp3 = vec_sel(inp3, nan, sel3) >> sh16;
reg.val[0] = (__vector signed short)vec_perm(inp0, inp1, omask);
reg.val[1] = (__vector signed short)vec_perm(inp2, inp3, omask);
}
inline void prefetch(const void* addr) { void __dcbt(const void* addr); }
}; // namespace vec_op
#endif

View File

@ -16,9 +16,18 @@ namespace vec_op {
AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \
AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__)
#define VLLM_DISPATCH_CASE_FLOATING_TYPES_FP8(...) \
AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \
AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \
AT_DISPATCH_CASE(at::ScalarType::Float8_e5m2, __VA_ARGS__)
#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \
AT_DISPATCH_SWITCH(TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__))
#define VLLM_DISPATCH_FLOATING_TYPES_WITH_E5M2(TYPE, NAME, ...) \
AT_DISPATCH_SWITCH(TYPE, NAME, \
VLLM_DISPATCH_CASE_FLOATING_TYPES_FP8(__VA_ARGS__))
#ifndef CPU_OP_GUARD
#define CPU_KERNEL_GUARD_IN(NAME)
#define CPU_KERNEL_GUARD_OUT(NAME)

Some files were not shown because too many files have changed in this diff.