Compare commits

...

624 Commits

Author SHA1 Message Date
0fc8fa751a fix: gptq marlin weight loading failure (#23066) 2025-08-17 15:56:07 -07:00
21e39436c8 [XPU] fix xpu to set cudagraph batch sizes (#23044)
Signed-off-by: calvin chen <wen.chen@dynamia.ai>
2025-08-17 21:45:42 +00:00
6d243efeda [Misc] Convert use_structured_output property into constant (#23060)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-08-17 12:41:38 -07:00
c55bc1db26 [Misc] Remove dead return (#23061)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-08-17 10:36:46 -07:00
292084e72a [BugFix] Fix for IMA in FA3 varlen combine (#22967)
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
2025-08-17 08:52:04 -07:00
16bff144be [Misc] fix typo in the multimodal doc (#23051) 2025-08-17 01:56:20 -07:00
fe0411fc6f [Bugfix] should use stack instead of concat (#22972)
Signed-off-by: 947132885 <947132885@qq.com>
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-08-17 08:46:36 +00:00
4d4061b6e7 [Kernel] Add cuda kernel for gpt_oss activation (#22951)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-08-17 05:03:24 +00:00
87f48623a5 [Misc] method name typo fix (#23042)
Signed-off-by: Andy Xie <andy.xning@gmail.com>
2025-08-16 21:49:14 -07:00
5c32143b9d [Refactor] Defer tensor data construction in MultiModalKwargs (#23030)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-08-16 21:05:50 -07:00
94096a47c9 [UX] Separate marlin moe config logic from triton moe (#23006) 2025-08-16 22:16:42 -04:00
a258ad8bcc [Bugfix] fix qwen3 moe fp8 accuracy issue (#23031)
Signed-off-by: Jinzhen Lin <jinzhen.ljz@antgroup.com>
2025-08-16 17:41:23 -07:00
bf7f470b22 [V1] Logits processors extensibility (#19912)
Signed-off-by: Andrew Feldman <afeldman@redhat.com>
Signed-off-by: Andrew Feldman <afeld2012@gmail.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Signed-off-by: Nick Hill <nhill@redhat.com>
Co-authored-by: Nick Hill <nhill@redhat.com>
Co-authored-by: Andrew Feldman <afeld2012@gmail.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-08-16 12:59:17 -07:00
4fc722eca4 [Kernel/Quant] Remove AQLM (#22943)
Signed-off-by: mgoin <mgoin64@gmail.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
2025-08-16 19:38:21 +00:00
3253ae765e [Flaky CI] Increase timeout tolerance for test_mp_crash_detection+test_default_mm_lora_chat_completions (#23028)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-08-16 18:33:08 +00:00
000cceca8c [Bugfix gpt-oss] Fix float32 convert for flashinfer sink support (#23016)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-08-16 11:16:00 -07:00
68373d3126 [Frontend] Added support for HermesToolParser for models without special tokens (#16890)
Signed-off-by: minpeter <kali2005611@gmail.com>
2025-08-16 17:38:42 +00:00
52ce1420e9 Fix handling of max_num_batched_tokens for pooling tasks (#23004)
Signed-off-by: Max de Bayser <mbayser@br.ibm.com>
2025-08-16 17:36:30 +00:00
829bbd7882 [New Model]mBART model (#22883)
Signed-off-by: 汪志鹏 <wangzhipeng628@gmail.com>
2025-08-16 12:16:58 +00:00
4dff91c93d [Refactor] Allow optional MultiModalKwargsItem in IPC (#23022)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-08-16 11:30:49 +00:00
de9cb61763 Add docs for PrefixRepetitionDataset + enable usage with vllm bench throughput (#23012)
Signed-off-by: Seiji Eicher <seiji@anyscale.com>
Co-authored-by: Roger Wang <hey@rogerw.me>
2025-08-16 10:21:20 +00:00
2dbccce8a6 [CI][Bugfix] Skip Ovis2 generation test because of broken remote code (#22954)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-08-16 09:44:19 +00:00
933f45334a [Core] Make cudagraph check cuda platform only (#23005)
Signed-off-by: Chengji Yao <chengjiyao@gmail.com>
Signed-off-by: Chengji Yao <chengjiyao@google.com>
Co-authored-by: Chengji Yao <chengjiyao@gmail.com>
Co-authored-by: Li, Jiang <jiang1.li@intel.com>
2025-08-16 07:46:00 +00:00
cc826a202b [Multimodal] Update Tensor schema test to cover arbitrary shape mm inputs (#22867)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-08-16 00:44:50 -07:00
6d3da472bc [Misc] Add --save-dir option to benchmark_moe (#23020)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-08-16 07:26:10 +00:00
78863f8c5c [BugFix] Add support for loading prompt embeds tensors serialized on unavailable devices and sparse tensors (#22962)
Signed-off-by: Andrew Sansom <andrew@protopia.ai>
2025-08-16 06:25:10 +00:00
5157827cfc [Build] Env var to disable sccache (#22968)
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
2025-08-16 05:36:27 +00:00
7caec10e7b [XPU]avoid circular import during XPU init (#23017)
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
2025-08-16 05:16:34 +00:00
1f83e7d849 [misc] nsys profile output kernel classifier and visualizer (#22971)
Signed-off-by: Grace Ho <grho@nvidia.com>
2025-08-16 02:52:51 +00:00
e4e37ded56 [V1] support min_tokens for detokener (#22014)
Signed-off-by: calvin chen <wen.chen@dynamia.ai>
Co-authored-by: Nick Hill <nhill@redhat.com>
2025-08-16 02:28:10 +00:00
f6b5040590 [Frontend] Avoid list copies in serving_chat.py (#22947)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-08-16 02:06:30 +00:00
fbd88728b3 [Bugfix] Fix DeepSeek MTP (#22934)
Signed-off-by: Benjamin Chislett <benjamin.chislett@centml.ai>
2025-08-16 01:25:06 +00:00
070da660c1 [Kernel] Simplify get_kv_cache_layout and cache use_trtllm_attention env-dependent bit (#22735)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-08-16 00:14:08 +00:00
ad0297d113 [Misc] Support passing multiple request ids at once to AsyncLLM.abort() (#22944)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-08-15 17:00:36 -07:00
236b864e4f [BugFix] Make run_once thread-safe (#22978)
Signed-off-by: <wenji.yyc@alibaba-inc.com>
Signed-off-by: Yichen Yan <wenji.yyc@alibaba-inc.com>
2025-08-15 16:56:17 -07:00
3e2f7985a2 Support multiple attention groups for KV sharing (#22672)
Signed-off-by: Yong Hoon Shin <yhshin@meta.com>
2025-08-15 16:54:10 -07:00
c280066f9d [v1] Move block_hashes from KVCacheManager to Request.block_hashes (#19728)
Signed-off-by: Or Ozeri <oro@il.ibm.com>
2025-08-15 16:52:52 -07:00
b9dc9d2607 [BugFix] Handle case where async utility call is cancelled (#22996)
Signed-off-by: Nick Hill <nhill@redhat.com>
Co-authored-by: Yinghai Lu <yinghai@thinkingmachines.ai>
2025-08-15 17:38:42 -06:00
1fc375dc05 [Structured Outputs] [Bug] Fix misalignment in apply_grammar_bitmask causing unintended masking and NaN logits (#22963)
Signed-off-by: rishitdholakia13 <rishit+github@cohere.com>
2025-08-15 23:25:05 +00:00
76144adf76 ci: Add CUDA + arm64 release builds (#21201)
Signed-off-by: Eli Uriegas <eliuriegas@meta.com>
2025-08-15 23:16:23 +00:00
f5d412bafb [BugFix] Fix regression caused by mamba state dtype PR (#22998)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
2025-08-15 22:55:26 +00:00
177e55e3bd [Attention] FA3 Attention Sinks Perf Boost (#22478)
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
2025-08-15 17:41:07 -04:00
1723ef1aae minor: zero workspace buffer init for flashinfer trtllm-gen attn (#22603) 2025-08-15 21:38:10 +00:00
00d6cba0cf Add PrefixRepetitionRandomDataset to vllm bench serve datasets (#20638)
Signed-off-by: Seiji Eicher <seiji@anyscale.com>
2025-08-15 14:09:23 -07:00
7f89ed248f [Fix] enable swap_ab for pplx problem size computation (#22991)
Signed-off-by: Shixian Cui <shixian@amazon.com>
Co-authored-by: Shixian Cui <shixian@amazon.com>
2025-08-15 14:02:12 -07:00
8a87cd27d9 [CI] Speed up Whisper tests by reusing server (#22859)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-08-15 16:56:31 -04:00
a344a1a7da Use regex in convert-results-json-to-markdown.py (#22989)
Signed-off-by: Michael Goin <mgoin64@gmail.com>
2025-08-15 20:54:20 +00:00
79899b63f6 [Bugfix] Added more env vars to hash (#22449)
Signed-off-by: Julien Lin <jullin@nvidia.com>
2025-08-15 20:08:37 +00:00
6e670778cd [Core] direct indexing on self.block_table_np in compute_slot_mapping (#22940)
Signed-off-by: linzebing <linzebing1995@gmail.com>
2025-08-15 12:12:12 -07:00
df5afa82e5 [Log] Debug Once for Randomizing dummy data for DP Rank (#22860)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-08-15 11:51:50 -07:00
6cd69f51bf [Model] Granite-4 support loading quantized checkpoint (#22925)
Signed-off-by: Chih-Chieh-Yang <7364402+cyang49@users.noreply.github.com>
2025-08-15 18:47:56 +00:00
8ad7285ea2 [Kernels] Clean up FusedMoeMethodBase and modular kernel setup. Remove extra arguments from modular kernel methods. (#22035)
Signed-off-by: Bill Nell <bnell@redhat.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
2025-08-15 14:46:00 -04:00
48b01fd4d4 [Structured Output] Make the output of structured output example more complete (#22481)
Signed-off-by: shen-shanshan <467638484@qq.com>
2025-08-15 18:29:25 +00:00
993d3d122b [Benchmarks] Include image data when ShareGPT4V dataset is used. (#22955)
Signed-off-by: Chenheli Hua <huachenheli@outlook.com>
2025-08-15 18:23:06 +00:00
68af77e51c [FIXBUG] Correctly Apply Grammar Bitmask in Mixed Batches (#22896)
Signed-off-by: JartX <sagformas@epdcenter.es>
2025-08-15 17:42:49 +00:00
6b04039a72 [BugFix] Skip the Q component for QKVParallelLinear in the case of QKVCrossParallelLinear since its width is 0 (#22369)
Signed-off-by: sstamenk <sstamenk@amd.com>
2025-08-15 17:17:31 +00:00
1c859a1387 [V0 Deprecation] Remove advance_step (#22969)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-08-15 08:22:31 -07:00
74f441f4b5 [Core] Allow full cudagraph with separate attention routines and orthogonal to compilation, add support for FA2 and FlashInfer (#20059)
Signed-off-by: fhl <2410591650@qq.com>
Signed-off-by: fhl2000 <63384265+fhl2000@users.noreply.github.com>
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
Signed-off-by: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com>
Co-authored-by: Luka Govedič <ProExpertProg@users.noreply.github.com>
Co-authored-by: Lucas Wilkinson <lwilkins@redhat.com>
Co-authored-by: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com>
2025-08-15 10:01:39 -04:00
a0632a3e03 [Frontend] Expose do_log_stats interval to env (#22905)
Signed-off-by: Csrayz <jover@cmbchina.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-08-15 13:00:20 +00:00
e8b40c7fa2 [CI] Remove duplicated docs build from buildkite (#22924)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-08-15 05:58:06 -07:00
48f4636927 [Misc] Ignore ep_kernels_workspace (#22807)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-08-15 05:58:03 -07:00
75531a6c13 [V1] [Hybrid] Support using float32 for state in Hybrid Models (Mamba2, Mamba1, Minimax) (#22928)
Signed-off-by: Daniel Afrimi <danielafrimi8@gmail.com>
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
Co-authored-by: Daniel Afrimi <danielafrimi8@gmail.com>
Co-authored-by: Burkhard Ringlein <ngl@zurich.ibm.com>
Co-authored-by: Chen Zhang <zhangch99@outlook.com>
2025-08-15 12:57:06 +00:00
22341b996e Improve multimodal hasher performance for re-used Image prompts (#22825)
Signed-off-by: Staszek Pasko <staszek@gmail.com>
2025-08-15 12:32:56 +00:00
49252cf59e [MM] Allow skipping memory profiling for multimodal models. (#22950)
Signed-off-by: Roger Wang <hey@rogerw.me>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-08-15 11:41:38 +00:00
3e6dd40016 [Bugfix] fix cuda 12.6 and 11.8 build (#22952)
Signed-off-by: Jinzhen Lin <jinzhen.ljz@antgroup.com>
2025-08-15 10:10:22 +00:00
aa300c438d [Bugfix] Unquote file uri before reading image (#22912)
Signed-off-by: Sayandip Dutta <sayandip199309@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2025-08-15 09:28:00 +00:00
fe91ce9591 [V1] - Split Prefill and Decode for Mamba1 models (#22653)
Signed-off-by: amirk <amirk@ai21.com>
Signed-off-by: asafg <asafg@ai21.com>
Co-authored-by: asafg <asafg@ai21.com>
Co-authored-by: Asaf Joseph Gardin <39553475+Josephasafg@users.noreply.github.com>
2025-08-15 08:59:52 +00:00
5406ebf5c9 [CI] Pooling models mteb test uses enforce_eager (#22878)
Signed-off-by: wang.yuqi <noooop@126.com>
2025-08-15 01:16:15 -07:00
b2c06509e5 [P/D]Provide bucket algorithm rate limiter for proxy_server (#22643)
Signed-off-by: frankie-ys <yongshengwang@cmbchina.com>
Signed-off-by: frankie <wangyongsheng686@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: Kuntai Du <kuntai@uchicago.edu>
2025-08-15 07:01:48 +00:00
b2f6c247a9 Revert "[ROCm][AITER] Support AITER Rope ops in RotaryEmbedding Module." (#22956)
Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
Co-authored-by: vllmellm <vllm.ellm@embeddedllm.com>
2025-08-15 06:39:19 +00:00
3d232dbd19 [Mamba] - refactor: Renamed mamba_attn to mamba2_attn (#22818)
Signed-off-by: asafg <asafg@ai21.com>
Co-authored-by: asafg <asafg@ai21.com>
2025-08-15 06:38:05 +00:00
5c3fbfe46b [Feature] Full Cuda Graph Support for Cutlass MLA and 6% E2E Throughput Improvement (#22763)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-08-15 06:27:30 +00:00
b4cef5e6c7 refactor: Change scaling factors calculation for flashinfer FusedMoE (#22812)
Signed-off-by: Amir Klein <203507526+amirkl94@users.noreply.github.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
2025-08-15 06:19:31 +00:00
0fe85087a9 [CI Perf] Prune tests in tests/kernels/attention/ (#22936)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-08-14 21:34:53 -06:00
d2b0e97ea6 [CI Perf] Prune tests in tests/kernels/moe/ (#22939)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-08-14 21:33:42 -06:00
590bddbfc5 [CI Perf] Prune tests in tests/kernels/quantization/ (#22942)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-08-14 21:25:34 -06:00
ae05a6d83d [BugFix] Fix port lookup in internal DP LB tests (#22252)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-08-15 11:17:11 +08:00
0933f9d518 [BugFix][KVConn] Fix use of get_required_kvcache_layout (#22734)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-08-15 01:39:43 +00:00
f1f0d2fab8 Revert "[Kernel] Add cuda kernel for gpt_oss activation" (#22948) 2025-08-14 17:38:10 -07:00
81f4b96481 [Kernel] Add cuda kernel for gpt_oss activation (#22538)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-08-14 17:21:29 -07:00
39cd09dc86 [Bugfix] use flash attn on sm90 (#22933)
Signed-off-by: Yongye Zhu <zyy1102000@gmail.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
2025-08-14 16:37:22 -07:00
919234fe17 [BugFix] Fix initial DP request load imbalance (#22910)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-08-14 15:20:28 -07:00
ebcce2cd36 [Core] Return final response for aborted requests from AsyncLLM.generate (#22283)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-08-14 14:49:02 -07:00
4121de512e [Quantization]: Support compressed-tensors mixed-precision model loading (#22468)
Signed-off-by: Dipika Sikka <dipikasikka1@gmail.com>
2025-08-14 17:32:09 -04:00
279a5f31b3 [Kernel] Add nvfp4 gemm flashinfer backends (#22346)
Signed-off-by: Julien Lin <jullin@nvidia.com>
Signed-off-by: mgoin <mgoin64@gmail.com>
Co-authored-by: mgoin <mgoin64@gmail.com>
2025-08-14 16:03:55 -04:00
b8ff05361a [CI] Temporarily disable flaky test (#22930)
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
2025-08-14 19:59:16 +00:00
Nir
637093ae26 docs: update fastsafetensors usage instructions (#22891)
Signed-off-by: Nir Levy <bhr166@gmail.com>
2025-08-14 19:56:54 +00:00
33c63e9547 [Kernel] [Quantization] Add MXFP4 and bias support for marlin kernel (#22428)
Signed-off-by: rongfu.leng <rongfu.leng@daocloud.io>
Signed-off-by: Jinzhen Lin <linjinzhen@hotmail.com>
Signed-off-by: Huzaifa Sidhpurwala <huzaifas@redhat.com>
Signed-off-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Signed-off-by: mgoin <mgoin64@gmail.com>
Signed-off-by: Animesh Jain <anijain@umich.edu>
Signed-off-by: Rui Qiao <ruisearch42@gmail.com>
Signed-off-by: Xiongfei Wei <isaacwxf23@gmail.com>
Signed-off-by: Nick Hill <nhill@redhat.com>
Signed-off-by: yewentao256 <zhyanwentao@126.com>
Signed-off-by: kf <kuanfu.liu@embeddedllm.com>
Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
Signed-off-by: NickLucche <nlucches@redhat.com>
Signed-off-by: Dipika Sikka <dipikasikka1@gmail.com>
Signed-off-by: Sage Moore <sage@neuralmagic.com>
Signed-off-by: tjtanaavllm <tunjian.tan@amd.com>
Signed-off-by: Yong Hoon Shin <yhshin@meta.com>
Signed-off-by: Chih-Chieh-Yang <7364402+cyang49@users.noreply.github.com>
Signed-off-by: Roger Wang <hey@rogerw.me>
Signed-off-by: Vadim Gimpelson <vadim.gimpelson@centml.ai>
Signed-off-by: Isotr0py <2037008807@qq.com>
Signed-off-by: zRzRzRzRzRzRzR <2448370773@qq.com>
Signed-off-by: Chih-Chieh Yang <7364402+cyang49@users.noreply.github.com>
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
Signed-off-by: yan <yan.ma@intel.com>
Signed-off-by: Yan Ma <yan.ma@intel.com>
Signed-off-by: Xiao Liu <xiszishu@gmail.com>
Signed-off-by: jiahanc <173873397+jiahanc@users.noreply.github.com>
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Signed-off-by: Ye (Charlotte) Qi <yeq@meta.com>
Signed-off-by: LopezCastroRoberto <roberto.lopez.castro@udc.es>
Signed-off-by: Andy Xie <andy.xning@gmail.com>
Signed-off-by: Haibin Lin <haibin.lin@bytedance.com>
Signed-off-by: David Ben-David <davidb@pliops.com>
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Signed-off-by: jiang1.li <jiang1.li@intel.com>
Signed-off-by: Seiji Eicher <seiji@anyscale.com>
Signed-off-by: zitian.zhao <zitian.zhao@tencentmusic.com>
Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>
Signed-off-by: Abirdcfly <fp544037857@gmail.com>
Signed-off-by: Giancarlo Delfin <gdelfin@meta.com>
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
Signed-off-by: huangweixiao <huangweixiao@msh.team>
Signed-off-by: alyosha-swamy <raghav@arcee.ai>
Signed-off-by: Eric Hanley <ericehanley@google.com>
Signed-off-by: Abatom <abzhonghua@gmail.com>
Signed-off-by: CLFutureX <775523362@qq.com>
Signed-off-by: Linkun Chen <github@lkchen.net>
Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
Signed-off-by: tlipoca9 <tlipoca9@gmail.com>
Signed-off-by: elvischenv <219235043+elvischenv@users.noreply.github.com>
Signed-off-by: zitian zhao <zitian.zhao@tencentmusic.com>
Signed-off-by: mgoin <michael@neuralmagic.com>
Signed-off-by: wang.yuqi <noooop@126.com>
Signed-off-by: Benji Beck <benjibeck@meta.com>
Signed-off-by: Siyuan Liu <lsiyuan@google.com>
Signed-off-by: Benjamin Chislett <benjamin.chislett@centml.ai>
Signed-off-by: isotr0py <2037008807@qq.com>
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
Signed-off-by: simon-mo <xmo@berkeley.edu>
Signed-off-by: LucasWilkinson <lwilkinson@neuralmagic.com>
Signed-off-by: Zhang Jason <ning.zhang2@amd.com>
Signed-off-by: Yongye Zhu <zyy1102000@gmail.com>
Signed-off-by: asafg <asafg@ai21.com>
Signed-off-by: Siyuan Fu <siyuanf@nvidia.com>
Signed-off-by: Lain <fusiyuan2000@hotmail.com>
Signed-off-by: Max de Bayser <mbayser@br.ibm.com>
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
Signed-off-by: Tao He <linzhu.ht@alibaba-inc.com>
Signed-off-by: Michael Goin <mgoin64@gmail.com>
Signed-off-by: QscQ <qscqesze@gmail.com>
Signed-off-by: qingjun <qingjun@minimaxi.com>
Signed-off-by: Syed Muhammad Bin Asif <syedmba7@connect.hku.hk>
Signed-off-by: Lionel Villard <villard@us.ibm.com>
Signed-off-by: ycyaw66 <497410282@qq.com>
Signed-off-by: David Chen <530634352@qq.com>
Signed-off-by: Linkun <github@lkchen.net>
Signed-off-by: Moritz Sanft <58110325+msanft@users.noreply.github.com>
Signed-off-by: Ming Yang <minos.future@gmail.com>
Signed-off-by: Adrian Garcia <adrian.garcia@inceptionai.ai>
Signed-off-by: shaojunqi <shaojunqi.sjq@alibaba-inc.com>
Signed-off-by: Ricardo Decal <rdecal@anyscale.com>
Signed-off-by: Andrew Chan <andrewkchan.akc@gmail.com>
Signed-off-by: Felix Marty <Felix.Marty@amd.com>
Signed-off-by: Andrew Sansom <andrew@protopia.ai>
Signed-off-by: Zhiyu Cheng <zhiyuc@nvidia.com>
Signed-off-by: Shu Wang <shuw@nvidia.com>
Signed-off-by: Po-Han Huang <pohanh@nvidia.com>
Signed-off-by: Shu Wang. <shuw@nvidia.com>
Signed-off-by: XIn Li <xinli@nvidia.com>
Signed-off-by: Junhao Li <junhao@ubicloud.com>
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
Signed-off-by: iAmir97 <Amir.balwel@embeddedllm.com>
Signed-off-by: iAmir97 <71513472+iAmir97@users.noreply.github.com>
Signed-off-by: <zyy1102000@gmail.com>
Signed-off-by: Guy Stone <guys@spotify.com>
Signed-off-by: <yyweiss@gmail.com>
Signed-off-by: yyw <yyweiss@gmail.com>
Signed-off-by: Russell Bryant <rbryant@redhat.com>
Signed-off-by: Pradyun Ramadorai <pradyunr@amazon.com>
Signed-off-by: Pradyun92 <142861237+Pradyun92@users.noreply.github.com>
Signed-off-by: Jinzhen Lin <jinzhen.ljz@antgroup.com>
Co-authored-by: rongfu.leng <rongfu.leng@daocloud.io>
Co-authored-by: Huzaifa Sidhpurwala <huzaifas@redhat.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Russell Bryant <rbryant@redhat.com>
Co-authored-by: Varun Sundar Rabindranath <varunsundar08@gmail.com>
Co-authored-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: Animesh Jain <jainanimesh2305@yahoo.com>
Co-authored-by: Rui Qiao <161574667+ruisearch42@users.noreply.github.com>
Co-authored-by: XiongfeiWei <isaacwxf23@gmail.com>
Co-authored-by: Nick Hill <nhill@redhat.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Co-authored-by: JartX <sagformas@gmail.com>
Co-authored-by: fhl2000 <63384265+fhl2000@users.noreply.github.com>
Co-authored-by: vllmellm <vllm.ellm@embeddedllm.com>
Co-authored-by: kf <kuanfu.liu@embeddedllm.com>
Co-authored-by: Nicolò Lucchesi <nlucches@redhat.com>
Co-authored-by: Dipika Sikka <dipikasikka1@gmail.com>
Co-authored-by: Sage Moore <sage@neuralmagic.com>
Co-authored-by: tjtanaavllm <tunjian.tan@amd.com>
Co-authored-by: Yong Hoon Shin <48474650+sarckk@users.noreply.github.com>
Co-authored-by: Chih-Chieh Yang <7364402+cyang49@users.noreply.github.com>
Co-authored-by: Roger Wang <hey@rogerw.me>
Co-authored-by: Vadim Gimpelson <156319763+vadiklyutiy@users.noreply.github.com>
Co-authored-by: Yuxuan Zhang <2448370773@qq.com>
Co-authored-by: Isotr0py <2037008807@qq.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
Co-authored-by: Thomas Parnell <tpa@zurich.ibm.com>
Co-authored-by: Yan Ma <yan.ma@intel.com>
Co-authored-by: Xiao <xiszishu@gmail.com>
Co-authored-by: jiahanc <173873397+jiahanc@users.noreply.github.com>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Co-authored-by: Ye (Charlotte) Qi <yeq@meta.com>
Co-authored-by: Roberto L. Castro <38211239+LopezCastroRoberto@users.noreply.github.com>
Co-authored-by: Ning Xie <andy.xning@gmail.com>
Co-authored-by: H <linhaibin.eric@gmail.com>
Co-authored-by: David Ben-David <sdavidbd@gmail.com>
Co-authored-by: David Ben-David <davidb@pliops.com>
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Co-authored-by: Li, Jiang <jiang1.li@intel.com>
Co-authored-by: TankNee <nee@tanknee.cn>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: Seiji Eicher <58963096+eicherseiji@users.noreply.github.com>
Co-authored-by: ZiTian.Zhao <zitian.zhao@tencentmusic.com>
Co-authored-by: 22quinn <33176974+22quinn@users.noreply.github.com>
Co-authored-by: Abirdcfly <fp544037857@gmail.com>
Co-authored-by: Giancarlo Delfin <32987265+TheEpicDolphin@users.noreply.github.com>
Co-authored-by: Chenxi Yang <cxyang@cs.utexas.edu>
Co-authored-by: Chenxi Yang <cxyang@meta.com>
Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>
Co-authored-by: Weixiao Huang <hwx.simle@gmail.com>
Co-authored-by: Raghav Ravishankar <113712354+alyosha-swamy@users.noreply.github.com>
Co-authored-by: ericehanley <ericehanley@google.com>
Co-authored-by: Zhonghua Deng <abzhonghua@gmail.com>
Co-authored-by: Po-Han Huang (NVIDIA) <53919306+nvpohanh@users.noreply.github.com>
Co-authored-by: PiteXChen <44110731+CLFutureX@users.noreply.github.com>
Co-authored-by: lkchen <github@lkchen.net>
Co-authored-by: TJian <tunjian.tan@embeddedllm.com>
Co-authored-by: Gregory Shtrasberg <156009573+gshtras@users.noreply.github.com>
Co-authored-by: tlipoca9 <160737620+tlipoca9@users.noreply.github.com>
Co-authored-by: elvischenv <219235043+elvischenv@users.noreply.github.com>
Co-authored-by: wang.yuqi <noooop@126.com>
Co-authored-by: Benji Beck <benjibeck@meta.com>
Co-authored-by: youkaichao <youkaichao@gmail.com>
Co-authored-by: Siyuan Liu <lsiyuan@google.com>
Co-authored-by: Benjamin Chislett <chislett.ben@gmail.com>
Co-authored-by: LiuXiaoxuanPKU <lilyliupku@gmail.com>
Co-authored-by: simon-mo <xmo@berkeley.edu>
Co-authored-by: Chen Zhang <zhangch99@outlook.com>
Co-authored-by: Hongxia Yang <62075498+hongxiayang@users.noreply.github.com>
Co-authored-by: Minseok Lee <47620120+minseokl@users.noreply.github.com>
Co-authored-by: Yongye Zhu <zyy1102000@gmail.com>
Co-authored-by: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com>
Co-authored-by: Zhang Jason <ning.zhang2@amd.com>
Co-authored-by: Asaf Joseph Gardin <39553475+Josephasafg@users.noreply.github.com>
Co-authored-by: asafg <asafg@ai21.com>
Co-authored-by: Lain <siyuanf@nvidia.com>
Co-authored-by: tc-mb <157115220+tc-mb@users.noreply.github.com>
Co-authored-by: imning3 <hbning@pku.edu.cn>
Co-authored-by: Maximilien de Bayser <mbayser@br.ibm.com>
Co-authored-by: Kunshang Ji <kunshang.ji@intel.com>
Co-authored-by: Tao He <linzhu.ht@alibaba-inc.com>
Co-authored-by: qscqesze <qingjun@minimaxi.com>
Co-authored-by: Syed Muhammad Bin Asif <92625830+syedmba@users.noreply.github.com>
Co-authored-by: Lionel Villard <villard@us.ibm.com>
Co-authored-by: WeiQing Chen <40507679+david6666666@users.noreply.github.com>
Co-authored-by: ycyaw66 <497410282@qq.com>
Co-authored-by: Moritz Sanft <58110325+msanft@users.noreply.github.com>
Co-authored-by: Ming Yang <minos.future@gmail.com>
Co-authored-by: Adrián García García <adrigarvk8@gmail.com>
Co-authored-by: Michael Goin <mgoin@redhat.com>
Co-authored-by: JaceyShao <65159281+JaceyShao@users.noreply.github.com>
Co-authored-by: shaojunqi <shaojunqi.sjq@alibaba-inc.com>
Co-authored-by: Ricardo Decal <crypdick@users.noreply.github.com>
Co-authored-by: Andrew Chan <andrewkchan.akc@gmail.com>
Co-authored-by: fxmarty-amd <felmarty@amd.com>
Co-authored-by: Andrew Sansom <andrew@protopia.ai>
Co-authored-by: Zhiyu <zhiyuc@nvidia.com>
Co-authored-by: Shu Wang <shuw@nvidia.com>
Co-authored-by: XIn Li <xinli@nvidia.com>
Co-authored-by: Junhao Li <streaver91@gmail.com>
Co-authored-by: Chauncey <chaunceyjiang@gmail.com>
Co-authored-by: iAmir97 <71513472+iAmir97@users.noreply.github.com>
Co-authored-by: iAmir97 <Amir.balwel@embeddedllm.com>
Co-authored-by: Hong Hanh <hanh.usth@gmail.com>
Co-authored-by: Daniel Serebrenik <74646983+pliops-daniels@users.noreply.github.com>
Co-authored-by: yewentao256 <zhyanwentao@126.com>
Co-authored-by: Guy Stone <guys@spotify.com>
Co-authored-by: yyweiss <70619747+yyweiss@users.noreply.github.com>
Co-authored-by: Pradyun92 <142861237+Pradyun92@users.noreply.github.com>
Co-authored-by: Pradyun Ramadorai <pradyunr@amazon.com>
Co-authored-by: Nicolò Lucchesi <nicolo.lucchesi@gmail.com>
2025-08-14 11:23:22 -07:00
ab9f2cfd19 [CI] [Hybrid] Bump min transformers version for Bamba and Jamba (#22908)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
2025-08-14 11:01:16 -07:00
dbe298046c [Bugfix] Fix parsing of --disable-mm-preprocessor-cache (#22909)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-08-14 08:09:44 -07:00
625ccd1c4d [Bugfix] Replace custom Encoding class with BatchEncoding in MistralTokenizer (#22786)
Signed-off-by: zjy0516 <riverclouds.zhu@qq.com>
2025-08-14 08:09:27 -07:00
92ff41abea [Model] Modify the gate implementation of glm4_moe (#22832)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-08-14 05:28:50 -07:00
829b9a62d0 [Perf] Dont create unnecessary pooling params (#22876)
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
2025-08-14 05:28:09 -07:00
540d54ca8d [CI] Re-enable transcriptions test_long_audio_request (#22890)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-08-14 11:34:34 +00:00
0783f13960 [Doc] fix dead link (#22898)
Signed-off-by: Daniele Trifirò <dtrifiro@redhat.com>
2025-08-14 04:06:13 -07:00
7655dc3e45 [Bugfix] Add reset prefix cache for online serving (#22726)
Signed-off-by: iAmir97 <Amir.balwel@embeddedllm.com>
Signed-off-by: iAmir97 <71513472+iAmir97@users.noreply.github.com>
Co-authored-by: iAmir97 <Amir.balwel@embeddedllm.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-08-14 04:04:18 -07:00
f4efda821d Remove Phi 4 Flash configuration workaround (#22723)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-08-14 04:03:49 -07:00
eb08487b18 [BugFix] Threadsafe close async zmq sockets (#22877)
Signed-off-by: Nick Hill <nhill@redhat.com>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-08-14 03:44:29 -07:00
7c3a0741c6 [Bugfix] Fix PixtralHFImagePixelInputs dynamic shape check (#22827)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-08-14 02:35:43 -07:00
00e3f9da46 vLLM Benchmark suite improvement (#22119)
Signed-off-by: Tsai, Louie <louie.tsai@intel.com>
Signed-off-by: Louie Tsai <louie.tsai@intel.com>
Co-authored-by: Li, Jiang <bigpyj64@gmail.com>
2025-08-14 07:12:17 +00:00
a353bd083d [CI] remove flaky v0 test (#22864)
Signed-off-by: Robert Shaw <robshaw@redhat.com>
Co-authored-by: Robert Shaw <robshaw@redhat.com>
2025-08-13 21:41:51 -07:00
1d20c34717 [CI] Fix tests/distributed/test_ca_buffer_sharing.py (#22849)
Signed-off-by: ilmarkov <imarkov@redhat.com>
Co-authored-by: ilmarkov <imarkov@redhat.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
2025-08-13 20:09:30 -07:00
b6af24fba7 [CI][Entrypoints]: add filter to generation to filter out invalid tool calls (#22826)
Signed-off-by: Will Eaton <weaton@redhat.com>
2025-08-13 20:09:07 -07:00
0ca2393b47 [CI/Build] Increase pooling tolerance to pass CI (#22844)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: mgoin <mgoin64@gmail.com>
Co-authored-by: mgoin <mgoin64@gmail.com>
2025-08-13 18:52:48 -04:00
31a500c86f [Core] [N-gram SD Optimization][1/n] Propose tokens with a single KMP (#22437)
Signed-off-by: Jialin Ouyang <Jialin.Ouyang@gmail.com>
2025-08-13 14:44:06 -07:00
4e8614e88b Move checklist in PR template (#22852)
Signed-off-by: Luka Govedic <lgovedic@redhat.com>
2025-08-13 21:38:35 +00:00
c6cd5ca3d3 [ROCm][Bugfix] Fix compilation error in topk softmax fused kernel (#22819)
Signed-off-by: kliuae <kuanfu.liu@embeddedllm.com>
2025-08-13 13:45:03 -07:00
df0e0f023e [CI/Build] Skip gpt_big model test because of broken HF model (#22848)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-08-13 20:36:28 +00:00
b4b78d6317 [CI/Build] Fix param mismatch in test_eagle_correctness (#22847)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-08-13 10:55:25 -07:00
12817a8ac7 [CI] Fix tests/v1/e2e/test_kv_sharing_fast_prefill.py import on test (#22815)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-08-13 10:35:50 -07:00
c9232d41f4 [CI/Build] Update VLM common tests (#22841)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-08-13 10:03:05 -07:00
HWH
9bd9294f0e [Bugfix] Fix MiniCPMV Image input inference failed (#22813)
Signed-off-by: HWH <67449739+jio-H@users.noreply.github.com>
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2025-08-13 09:41:41 -07:00
da2705198f [Misc] clear and separate error messages for input too long and input + max-tokens too long (#22803)
Signed-off-by: Roger Wang <hey@rogerw.me>
2025-08-13 07:22:56 -07:00
19b927e52d [Core] Use individual MM items in P0/P1 cache and model runner (#22570)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-08-13 07:18:07 -07:00
20d65aa755 [Frontend] Multithreaded async multimodal load_bytes (#22710)
Signed-off-by: Alexandre Milesi <30204471+milesial@users.noreply.github.com>
Co-authored-by: Alexandre Milesi <30204471+milesial@users.noreply.github.com>
2025-08-13 06:09:26 -07:00
b159c0a67a Fix GGUF loader for Qwen3 MoE. (#22785)
Signed-off-by: Gh0u1L5 <Gh0u1L5@outlook.com>
2025-08-13 06:08:23 -07:00
6772bb0f7d Remove unnecessary CUDA sync of qwen image and video preprocess (#22792)
Signed-off-by: cyy <cyyever@outlook.com>
Signed-off-by: Yuanyuan Chen <cyyever@outlook.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2025-08-13 06:07:28 -07:00
fceafaf582 [Bugfix][mamba] Fix type annotation of Mamba2Metadata (#22787)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-08-13 06:07:09 -07:00
6b794c756c [Nixl][CI] Fix tests (#22806)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-08-13 06:03:53 -07:00
98deac3879 [FEATURE] support custom vllm tuned config path for fused moe triton kernels (#22791)
Signed-off-by: Chi Zhang <zhangchi.usc1992@bytedance.com>
2025-08-13 20:27:25 +08:00
653124bd46 [Frontend] Add chunked processing to handle long inputs in embedding models (#22280)
Signed-off-by: x22x22 <wadeking@qq.com>
Signed-off-by: Kdump <rootshellexp@gmail.com>
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: Maximilien de Bayser <maxdebayser@gmail.com>
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-08-13 04:14:24 -07:00
0b1bdac6af [Platform] Custom ops support for FusedMoe (#22509)
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
2025-08-13 04:12:00 -07:00
d94e3026de [V1] Add tree drafting tests for eagle spec decoding (#22705)
Signed-off-by: Giancarlo Delfin <gdelfin@meta.com>
2025-08-13 04:11:28 -07:00
3f52738dce [Doc] Add max_lora_rank configuration guide (#22782)
Signed-off-by: chiliu <cliu_whu@yeah.net>
2025-08-13 04:10:07 -07:00
a01e0018b5 [Bugfix] Fix Nemotron VL image processing (#22739)
Co-authored-by: ducviet00-h2 <viet.d.hoang@h2corporation.jp>
2025-08-13 03:11:36 -07:00
9e7e5baaa8 [Model] Add missing prefix to glm4_1v (#22716)
Signed-off-by: zRzRzRzRzRzRzR <2448370773@qq.com>
2025-08-13 01:23:33 -07:00
d16aa3dae4 [Model] Add option to run Step3VisionEncoder in DP (#22697)
Signed-off-by: zzh142857 <chaorenzhaozhenghao@gmail.com>
2025-08-13 00:09:13 -07:00
6807af8f46 [gpt-oss] upgrade gpt-oss to v0.0.3 and add version check (#22768)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-08-12 21:37:26 -07:00
4c558cf62e [Perf] Support topk softmax fused kernel for broader num_experts (#22211)
Signed-off-by: Shixian Cui <shixian@amazon.com>
Co-authored-by: Shixian Cui <shixian@amazon.com>
2025-08-12 21:34:47 -07:00
77a6bf07ae [Bug] Fix Unexpected Keyword Argument 'w1_bias' (#22757)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-08-12 21:31:47 -07:00
4082338a25 Remove unneeded ROCm platform import when using CUDA (#22765)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-08-12 21:26:38 -07:00
c6b928798e Force TRTLLM attention for gpt-oss on SM100 (#22678)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-08-12 21:22:16 -07:00
b1361c7273 [Bugfix] Fix default enable for CUTLASS MLA on SM100 (#22738)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-08-12 21:22:05 -07:00
4f0f844b16 Fix cuda illegal mem access with Llama4 TP8 + rms_norm custom op (#22701)
Signed-off-by: Po-Han Huang <pohanh@nvidia.com>
2025-08-12 21:21:50 -07:00
c5830381af [V0 Deprecation] Remove args for multi-step scheduling (#22779)
Signed-off-by: Woosuk Kwon <woosuk@thinkingmachines.ai>
2025-08-12 20:38:18 -07:00
d31f97cf57 [Misc] Remove tests/multi_step/__init__.py (#22778)
Signed-off-by: Woosuk Kwon <woosuk@thinkingmachines.ai>
2025-08-12 20:21:18 -07:00
71683ca6f6 [V0 Deprecation] Remove multi-step scheduling (#22138)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Signed-off-by: Woosuk Kwon <woosuk@thinkingmachines.ai>
2025-08-12 20:18:39 -07:00
e18859298d Add hardware plugins to installation doc (#22732)
Signed-off-by: Michael Goin <mgoin64@gmail.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-08-12 17:14:46 -07:00
fde0b611a3 [Model] Decouple glm4v (#22751)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-08-12 17:13:17 -07:00
d0a6301588 Fix Transformers backend tensor parallel for multimodal models (#22673)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-08-12 17:12:30 -07:00
45c3936e94 [Docs] Hide the navigation and toc sidebars on home page (#22749)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-08-12 17:12:26 -07:00
ba81acbdc1 [Bugfix] Bump DeepGEMM Version to Fix SMXX Layout Issues (#22606)
Signed-off-by: frankwang28 <frank.wbb@hotmail.com>
2025-08-12 15:43:06 -07:00
53c730286c [Misc] parametrize 'dtype' in test_flash_mla (#22641)
Signed-off-by: RUTHLESS-BOT <wujiafeng@cmbchina.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-08-12 16:31:48 -04:00
6534d2fc97 Fix torch version check for SM100 mxfp4 (#22535)
Signed-off-by: Zifei Tong <zifeitong@gmail.com>
Signed-off-by: mgoin <mgoin64@gmail.com>
Co-authored-by: mgoin <mgoin64@gmail.com>
2025-08-12 12:54:42 -07:00
422f22e012 [CI][Nixl] Check kv cache layout during handshake (#22745)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-08-12 12:53:52 -07:00
6bd8ebf026 [Kernel][AMD] Avoid D2H copy and cumsum kernel (#22683)
Signed-off-by: Xiaozhu <mxz297@gmail.com>
Signed-off-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-08-12 12:53:36 -07:00
dab4f9f764 [Chore] Update CODEOWNERS to include @yewentao256 for CUDA kernels, attention backends, quantization, and related tests (#22741)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-08-13 00:50:31 +08:00
c42fe0b63a Add more test scenario for tensor schema (#22733)
Signed-off-by: teekenl <teekenlau@gmail.com>
2025-08-12 16:34:41 +00:00
5a4b4b3729 Add: SupportsEagle3 interface for explicit EAGLE3 support (#22642)
Signed-off-by: Rahul Tuli <rtuli@redhat.com>
2025-08-12 09:24:52 -07:00
e5d3d63c42 [Benchmark] Fix terminal colors in benchmark_serving_multi_turn (python 3.12) (#22730)
Signed-off-by: daniels <daniels@pliops.com>
2025-08-12 14:41:37 +00:00
3d9d40efde [Bugfix][CI] Fix test_remote_decode_lifecycle.py::test_short_prompt_lifecycle (#22727)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-08-12 07:30:17 -07:00
67c153b88a Fix Llama4 FlashInfer FP4 MoE issues (#22511)
Signed-off-by: Po-Han Huang <pohanh@nvidia.com>
2025-08-12 05:50:59 -07:00
f7ad6a1eb3 [CI Failure] fix tests/entrypoints/openai/test_skip_tokenizer.py (#22708)
Signed-off-by: wang.yuqi <noooop@126.com>
2025-08-12 05:42:58 -07:00
80bb1e8afe Officially support SmolLM3 using the Transformers backend (#22665)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-08-12 05:38:48 -07:00
d030b01548 [BugFix][Nixl][PD] Fix heterogenous TP (#22663)
Signed-off-by: NickLucche <nlucches@redhat.com>
Co-authored-by: Nick Hill <nhill@redhat.com>
2025-08-12 05:37:30 -07:00
767e63b860 [Docs] Improve docs navigation (#22720)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-08-12 04:25:55 -07:00
007dd90859 [gpt-oss] Enable gpt-oss on ampere (#22714)
Signed-off-by: Yongye Zhu <zyy1102000@gmail.com>
2025-08-12 03:21:44 -07:00
b8a9d0e429 [Misc] remove GH discussions link (#22722)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-08-12 03:15:33 -07:00
50f2aae1b4 [LMCache][Example] Align the PYTHONHASHSEED for prefillers and decoders for KV chunks hashing (#21161)
Signed-off-by: zejunchen-zejun <zejun.chen@amd.com>
2025-08-12 02:05:14 -07:00
46ae7f6666 [Bugfix] Mamba2 SSD varlen bug fix initstates decay, improve test, assert chunk pwr 2 (#21783)
Signed-off-by: Rishi Astra <40644327+RishiAstra@users.noreply.github.com>
2025-08-12 02:04:37 -07:00
1ece7f30ba Fix: AWQ Marlin get_quant_method does not recognize "modules_to_not_convert" (#21888)
Signed-off-by: JunHowie <JunHowie@aliyun.com>
Co-authored-by: JunHowie <JunHowie@aliyun.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-08-12 02:03:53 -07:00
bc8372efc3 [Bugfix] Fix erroneous randomly generated cases in bad word testing (#22170)
Signed-off-by: phantomlei <phantomlei3@gmail.com>
2025-08-12 02:03:22 -07:00
8d17fa633e [V0] Correct CUDA Graph capture for encoder-decoder models (#22630) 2025-08-12 02:01:08 -07:00
9f909b8996 [New Model] Support Command-A-Vision (#22660)
Signed-off-by: donglu <donglu@cohere.com>
2025-08-12 01:39:54 -07:00
59f3b93636 [DOC] update v1_guide with INTEL HW (#22679)
Signed-off-by: Chendi.Xue <chendi.xue@intel.com>
2025-08-12 01:22:49 -07:00
78077d5417 Move SchedulerConfig from config/__init__.py to config/scheduler.py (#22626)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-08-12 00:23:49 -07:00
6d729c43fb [Bugfix] Fix ModernBert load & Enable sliding window attention for bidirectional attention. (#22637)
Signed-off-by: wang.yuqi <noooop@126.com>
Signed-off-by: Max de Bayser <mbayser@br.ibm.com>
Co-authored-by: Max de Bayser <mbayser@br.ibm.com>
2025-08-12 00:23:17 -07:00
2f4657952b [doc] Update x86 CPU-inference installation doc to reflect optionality of AVX512f (#22707)
Signed-off-by: Sooraj S <94284954+sooraj-satheesh@users.noreply.github.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Li, Jiang <bigpyj64@gmail.com>
2025-08-12 00:21:08 -07:00
3a7e3bbdd2 [Doc] Added unmentioned required option "method" in the usage of EAGLE-3 based models (#21737)
Signed-off-by: Dilute-l <dilu2333@163.com>
Co-authored-by: Dilute-l <dilu2333@163.com>
2025-08-12 00:14:51 -07:00
4fbd8bb597 Fix passing SpeculativeConfig from the CLI (#22652)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-08-11 22:13:32 -07:00
ad344ef552 [gpt-oss] Small bug fixes for frontend (#22512)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-08-11 22:04:38 -07:00
bbaf9e9cb1 [gpt-oss] Fix mxfp4 support (#22700)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-08-11 21:22:26 -07:00
4678503476 Migrate MiniCPMVImageInputs to TensorSchema (#21939)
Signed-off-by: Benji Beck <benjibeck@meta.com>
2025-08-11 20:43:37 -07:00
93d0652433 [CI] Increase timeout for test_completion_with_image_embeds (#22670)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-08-11 20:31:36 -07:00
ea1292ad3e [CI Failure] Use float32 for tests/entrypoints/openai/test_audio.py (#22686)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-08-11 20:20:42 -07:00
dc5e4a653c Upgrade FlashInfer to v0.2.11 (#22613)
Signed-off-by: Po-Han Huang <pohanh@nvidia.com>
Co-authored-by: mgoin <mgoin64@gmail.com>
2025-08-11 19:58:41 -07:00
839ab00349 Re-enable Xet on TPU tests now that hf_xet has been updated (#22666)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-08-11 19:54:40 -07:00
9b94d6ec8f Enable 4bit bnb prequant MOE (#21548)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
2025-08-11 19:02:14 -07:00
1891a265d3 [gpt-oss] Add test for response API + harmony (but skipped) (#22554)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-08-11 17:47:24 -07:00
95a935fc48 [gpt-oss] Support streaming in response API (#22431)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-08-11 17:46:59 -07:00
458e74eb90 Support more parallel styles in Transformers backend TP (#22651)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-08-11 10:42:48 -07:00
65abe111a3 [CI] Skip Tree Attn Test in test_max_len.py to unblock CI (#22664)
Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
2025-08-11 10:36:05 -07:00
807d21b80d [BugFix] [Spec Decode] Remove LlamaForCausalLMEagle3 to fix CI (#22611)
Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>
2025-08-11 10:31:36 -07:00
c90fb03df5 [CI/Build] Skip Mllama HF runner tests with Transformers v4.55.0 (#22659)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-08-11 10:00:58 -07:00
84cf78acee [Model] Pooling models default to using chunked prefill & prefix caching if supported. (#20930)
Signed-off-by: wang.yuqi <noooop@126.com>
2025-08-11 09:41:37 -07:00
16fb668b61 fix: NIXL connector transfers partial block to pass full multi-modal context (#21074)
Signed-off-by: GuanLuo <gluo@nvidia.com>
2025-08-11 09:40:55 -07:00
f7dcce7a4a [Feature] Add VLLM_USE_DEEP_GEMM_E8M0 Env to Control E8M0 Scale (#21968)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-08-11 09:39:08 -07:00
8e13d9fe6d [Misc] Further clean up some redundant config definitions (#22649)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-08-11 09:22:25 -07:00
3fa5b25845 Document aarch64 CPU support works (#22646)
Signed-off-by: Eric Curtin <ecurtin@redhat.com>
2025-08-11 07:22:45 -07:00
14a5d903ab [Model] NemotronH Support (#22349)
Signed-off-by: Daniel Afrimi <danielafrimi8@gmail.com>
2025-08-11 04:09:24 -07:00
951b038298 [Misc] Move jsontree to utils (#22622)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-08-11 03:49:32 -07:00
ebf7605b0d [Misc] Move tensor schema tests (#22612)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-08-11 00:15:27 -07:00
bc1d02ac85 [Docs] Add comprehensive CLI reference for all large vllm subcommands (#22601)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-08-11 00:13:33 -07:00
1e55dfa7e5 [BUGFIX] KeyError 'layers.14.mlp.gate.g_idx' for Qwen3-MoE with GPTQ on ROCm (#22017) 2025-08-11 00:13:30 -07:00
384a052971 [Misc] benchmark_moe supports expert parallel (#22251)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-08-11 00:13:27 -07:00
39052dbca8 Support token_type_ids in V1 with less code changes (#21985)
Signed-off-by: Max de Bayser <mbayser@br.ibm.com>
2025-08-10 22:54:59 -07:00
9c97a1c349 [ROCm][AITER] Support AITER Rope ops in RotaryEmbedding Module. (#22521)
Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
2025-08-10 22:52:34 -07:00
f919d4cb8f [BugFix] Fix logits repetition penalty cuda check (#22592) 2025-08-10 22:52:31 -07:00
afa5b7ca0b [Misc][gpt-oss] guard import when triton kernel when not up to date (#22584)
Signed-off-by: zhewenli <zhewenli@meta.com>
2025-08-10 21:29:35 -07:00
1b99028069 [Misc][gpt-oss] Add rules to label gpt-oss related PRs (#22600)
Signed-off-by: Lifan Shen <lifans@meta.com>
2025-08-10 19:49:51 -07:00
5898b135ab [BugFix] Fix KVConnectorOutput TPU breakage (#22598)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-08-10 19:33:48 -07:00
b799f4b9ea [CI/Build] Fix tensorizer test for load_format change (#22583)
Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>
2025-08-10 19:30:00 -07:00
06da44f0cb Migrate LlavaImageInputs to TensorSchema (#21770)
Signed-off-by: Benji Beck <benjibeck@meta.com>
2025-08-10 19:29:19 -07:00
a554991748 Migrate LlavaNextVideoPixelInputs to TensorSchema (#21843)
Signed-off-by: Benji Beck <benjibeck@meta.com>
2025-08-10 19:29:16 -07:00
d1af8b7be9 enable Docker-aware precompiled wheel setup (#22106)
Signed-off-by: dougbtv <dosmith@redhat.com>
2025-08-10 16:29:02 -07:00
68b254d673 Fix TensorSchema validation test for symbolic dims (#22366)
Signed-off-by: Benji Beck <benjibeck@meta.com>
2025-08-10 17:16:44 +00:00
8c50d62f5a Remove redundant row_indices unsqueeze operation in MiniCPMO (#22528)
Signed-off-by: zitian.zhao <zitian.zhao@tencentmusic.com>
2025-08-10 09:20:00 -07:00
b4e2916721 Migrate LlavaNextImageInputs to TensorSchema (#21774)
Signed-off-by: Benji Beck <benjibeck@meta.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-08-10 09:05:21 -07:00
65a7917be4 Fix(benchmarks): allow multiple mm contents in OpenAI Chat Completion Benchmarks (#22534)
Signed-off-by: breno.skuk <breno.skuk@hcompany.ai>
2025-08-10 09:03:15 -07:00
b76753f0b5 [Bugfix][Kernel] Support partial rotary embedding for MRoPE triton kernel (#22593)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-08-10 09:00:36 -07:00
b81fe83b2c [doc] add alibaba cloud as sponsor (#22597)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-08-10 23:13:47 +08:00
0757551c96 [doc] add beijing meetup links (#22596)
Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-08-10 22:51:36 +08:00
8290d15d2c Move CacheConfig from config/__init__.py to config/cache.py (#22586)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-08-10 07:36:40 -07:00
049c245143 [Misc] Replace flaky image urls in pixtral test (#22574)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-08-10 06:18:21 -07:00
00976db0c3 [Docs] Fix warnings in docs build (#22588)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-08-10 05:49:51 -07:00
d411df0296 [Misc] Further refine type annotations in parallel state (#22499)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-08-10 05:49:48 -07:00
010e0e39ea [Doc] Fix API doc link in side navigation (#22585)
Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>
2025-08-10 01:35:22 -07:00
326976291b [Misc] code clean duplicate set_current_vllm_config in _set_vllm_config (#22566)
Signed-off-by: Andy Xie <andy.xning@gmail.com>
2025-08-10 00:08:48 -07:00
7e8d685775 [Minor] Fix pre-commit error on main (#22579)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-08-10 00:08:23 -07:00
c49848396d Refactor sliding window configuration to Transformers best practice (#21927)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-08-09 20:50:48 -07:00
2a84fb422f [TPU] kv cache update kernel doesn't need to be padded slices to multiple of num_slices_per_block (#22394)
Signed-off-by: Chengji Yao <chengjiyao@gmail.com>
Co-authored-by: Chengji Yao <chengjiyao@gmail.com>
2025-08-09 20:49:04 -07:00
534c45b962 Improve fast_topk function with type hints and documentation (#22530)
Signed-off-by: zitian.zhao <zitian.zhao@tencentmusic.com>
2025-08-09 20:25:42 -07:00
3d7363e61c [Config] add "qwen" as a native eagle3 target supported model (#22333)
Signed-off-by: lechen <lecself@163.com>
Signed-off-by: LeChen <lecself@163.com>
2025-08-09 20:21:05 -07:00
0c5254b82a [oss] Init gpt-oss bf16 support (#22508)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-08-09 20:19:13 -07:00
61f67d8acd [V1] [Hybrid] Enable Full CUDA Graph (decode-only) for Mamba layers (#21401)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
2025-08-09 20:16:11 -07:00
42172ad18f [FEAT] [Performance] Add triton mrope to replace the torch code path (#22375)
Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
2025-08-09 11:50:03 -07:00
fbd8595c5c [Bugfix] Fix basic models tests hanging due to mm processor creation (#22571)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-08-09 11:42:21 -07:00
5a16fa614c [Model] Gemma3n MM (#20495)
Signed-off-by: ShriKode <shrikode@gmail.com>
Signed-off-by: NickLucche <nlucches@redhat.com>
Signed-off-by: Roger Wang <hey@rogerw.me>
Co-authored-by: ShriKode <shrikode@gmail.com>
Co-authored-by: Roger Wang <hey@rogerw.me>
2025-08-09 09:56:25 -07:00
2d18256e47 Move ParallelConfig from config/__init__.py to config/parallel.py (#22565)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-08-09 08:33:46 -07:00
56186474f6 [Docs] Reduce noise in docs and --help from the JSON tip (#22567)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-08-09 08:31:32 -07:00
1bf5e1f25b [CI] [Hybrid] Speed up hybrid models test by removing large models (#22563)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
2025-08-09 02:04:42 -07:00
a6022e6fbc GLM-4.5V with new class name at transformers (#22520)
Signed-off-by: zRzRzRzRzRzRzR <2448370773@qq.com>
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-08-09 00:50:21 -07:00
2be07a0db1 Update docs for Minimax-Text support (#22562)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
2025-08-09 00:18:18 -07:00
0edc0cd52b [Bugfix] Fix CI moe kernel failure (#22556)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-08-09 00:03:29 -07:00
7920e9b1c5 [Bugfix] Fix failing GPT-OSS initialization test (#22557)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-08-09 00:03:26 -07:00
b7c0942b65 [ROCm][Misc] Rename the context_len to seq_len in ROCm custom paged attention kernel (#22097)
Signed-off-by: charlifu <charlifu@amd.com>
2025-08-08 23:15:06 -07:00
9a0c5ded5a [TPU] Add support for online w8a8 quantization (#22425)
Signed-off-by: Kyuyeun Kim <kyuyeunk@google.com>
2025-08-08 23:12:54 -07:00
10a02535d4 Fix loading of quantized BigCode models (#22463)
Signed-off-by: Eldar Kurtic <eldar@neuralmagic.com>
2025-08-08 23:12:12 -07:00
65552b476b [Misc] Use config definitions from Transformers library (#21913)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-08-08 23:10:51 -07:00
7ad7adb67f v1: Pass KVConnectorOutput to scheduler-side (#22157)
Signed-off-by: Or Ozeri <oro@il.ibm.com>
2025-08-08 23:09:51 -07:00
6ade99eafa [V1] [Hybrid] Support Minimax-Text-01 in V1 (#22151)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
2025-08-08 23:08:48 -07:00
3157aebb63 [Log] Add Warning for Deprecation of DeepGEMM old version (#22194)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-08-08 23:07:48 -07:00
8a0ffd6285 Remove mamba_ssm from vLLM requirements; install inside test container using --no-build-isolation (#22541)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
2025-08-08 23:05:32 -07:00
23472ff51c [Doc] Add usage of implicit text-only mode (#22561)
Signed-off-by: Roger Wang <hey@rogerw.me>
Co-authored-by: Flora Feng <4florafeng@gmail.com>
2025-08-08 23:04:19 -07:00
08b751ba74 Implicit language-model-only mode via limit-mm-per-prompt (#22299)
Signed-off-by: Roger Wang <hey@rogerw.me>
Signed-off-by: Andy Xie <andy.xning@gmail.com>
Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
Signed-off-by: Andrew Sansom <andrew@protopia.ai>
Signed-off-by: Zhiyu Cheng <zhiyuc@nvidia.com>
Signed-off-by: Shu Wang <shuw@nvidia.com>
Signed-off-by: Po-Han Huang <pohanh@nvidia.com>
Signed-off-by: Shu Wang. <shuw@nvidia.com>
Signed-off-by: XIn Li <xinli@nvidia.com>
Signed-off-by: Junhao Li <junhao@ubicloud.com>
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
Signed-off-by: zRzRzRzRzRzRzR <2448370773@qq.com>
Signed-off-by: zitian.zhao <zitian.zhao@tencentmusic.com>
Signed-off-by: zitian zhao <zitian.zhao@tencentmusic.com>
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: iAmir97 <Amir.balwel@embeddedllm.com>
Signed-off-by: iAmir97 <71513472+iAmir97@users.noreply.github.com>
Signed-off-by: Linkun <github@lkchen.net>
Co-authored-by: Ning Xie <andy.xning@gmail.com>
Co-authored-by: TJian <tunjian.tan@embeddedllm.com>
Co-authored-by: Andrew Sansom <andrew@protopia.ai>
Co-authored-by: Zhiyu <zhiyuc@nvidia.com>
Co-authored-by: Shu Wang <shuw@nvidia.com>
Co-authored-by: XIn Li <xinli@nvidia.com>
Co-authored-by: Junhao Li <streaver91@gmail.com>
Co-authored-by: Chauncey <chaunceyjiang@gmail.com>
Co-authored-by: Yuxuan Zhang <2448370773@qq.com>
Co-authored-by: ZiTian Zhao <zitian.zhao@tencentmusic.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
Co-authored-by: Po-Han Huang (NVIDIA) <53919306+nvpohanh@users.noreply.github.com>
Co-authored-by: iAmir97 <71513472+iAmir97@users.noreply.github.com>
Co-authored-by: iAmir97 <Amir.balwel@embeddedllm.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: Hong Hanh <hanh.usth@gmail.com>
Co-authored-by: youkaichao <youkaichao@gmail.com>
Co-authored-by: lkchen <github@lkchen.net>
2025-08-08 22:21:40 -07:00
429e4e2d42 [Bugfix] Fix ModernBert cuda graph capturing in v1 (#21901)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-08-08 22:17:22 -07:00
35afe1b30b [BugFix] [P/D] Handle lookahead token count edge-case with Eagle Spec Decoding and P/D (#22317)
Signed-off-by: Pradyun Ramadorai <pradyunr@amazon.com>
Signed-off-by: Pradyun92 <142861237+Pradyun92@users.noreply.github.com>
Co-authored-by: Pradyun Ramadorai <pradyunr@amazon.com>
Co-authored-by: Nicolò Lucchesi <nicolo.lucchesi@gmail.com>
2025-08-08 17:04:15 -07:00
81c57f60a2 [XPU] upgrade torch 2.8 on for XPU (#22300)
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
2025-08-08 17:03:45 -07:00
311d875614 Drop flaky test_healthcheck_response_time (#22539)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
2025-08-08 16:56:47 -07:00
e3edc0a7a8 Extract CompilationConfig from config.py (#22524)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-08-08 16:34:25 -07:00
baece8c3d2 [Frontend] Add unix domain socket support (#18097)
Signed-off-by: <yyweiss@gmail.com>
Signed-off-by: yyw <yyweiss@gmail.com>
2025-08-08 16:23:44 -07:00
2fcf6b27b6 [Docs] fix broken links in metrics.md (#22315)
Signed-off-by: Guy Stone <guys@spotify.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-08-08 16:22:35 -07:00
41b9655751 Skip Qwen 1 in CI because remote code is no longer compatible with Transformers (#22536)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-08-08 16:20:58 -07:00
bd875d2eb7 [Bugfix] Update FA commit hash (#22546)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
2025-08-08 16:10:25 -07:00
f703b923f3 [Misc] DeepGEMM : Avoid JIT generation in the hot-path (#22215)
Signed-off-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
Co-authored-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
2025-08-08 16:09:59 -07:00
cd9b9de1fb [BugFix] Fix IMA FlashMLA full cuda-graph and DP + Update FlashMLA (#21691)
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
Co-authored-by: yewentao256 <zhyanwentao@126.com>
Co-authored-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
2025-08-08 16:09:42 -07:00
fe6d8257a1 [gpt-oss] Support tool call and implement MCP tool server (#22427)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-08-08 15:06:37 -07:00
e290594072 [Docs] Rename “Distributed inference and serving” to “Parallelism & Scaling” (#22466)
Signed-off-by: Ricardo Decal <rdecal@anyscale.com>
2025-08-08 19:26:21 +00:00
f756a682d9 [gpt-oss] guard import when triton kernel is not installed (#22529)
Signed-off-by: Yongye Zhu <zyy1102000@gmail.com>
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-08-08 11:18:33 -07:00
f0964e29cb [Benchmark] Add benchmark tool for multi turn conversations (#20267) 2025-08-08 10:28:50 -07:00
e789cad6b8 [gpt-oss] triton kernel mxfp4 (#22421)
Signed-off-by: <zyy1102000@gmail.com>
Signed-off-by: Yongye Zhu <zyy1102000@gmail.com>
2025-08-08 08:24:07 -07:00
e5ebeeba53 Remove exception for Python 3.8 typing from linter (#22506)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-08-08 03:06:46 -07:00
7be7f3824a [Docs] Improve API docs (+small tweaks) (#22459)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-08-08 03:02:51 -07:00
ccdae737a0 [BugFix] Don't cancel asyncio tasks directly from destructors (#22476)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-08-08 01:13:18 -07:00
904063907c [Misc] fix openai version (#22485)
Signed-off-by: rongfu.leng <rongfu.leng@daocloud.io>
2025-08-08 01:12:54 -07:00
43c4f3d77c [Misc] Begin deprecation of get_tensor_model_*_group (#22494)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-08-08 01:11:54 -07:00
1712543df6 [CI/Build] Fix multimodal tests (#22491)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-08-08 00:31:19 -07:00
808a7b69df [bench] Fix benchmark/serve.py to ignore unavailable results (#22382)
Signed-off-by: Linkun <github@lkchen.net>
2025-08-07 23:15:50 -07:00
099c046463 [Doc] Sleep mode documentation (#22310)
Signed-off-by: iAmir97 <Amir.balwel@embeddedllm.com>
Signed-off-by: iAmir97 <71513472+iAmir97@users.noreply.github.com>
Co-authored-by: iAmir97 <Amir.balwel@embeddedllm.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: Hong Hanh <hanh.usth@gmail.com>
Co-authored-by: youkaichao <youkaichao@gmail.com>
2025-08-08 12:25:18 +08:00
af473f0a85 [bugfix] Fix Llama3/4 issues caused by FlashInfer 0.2.10 (#22426)
Signed-off-by: Po-Han Huang <pohanh@nvidia.com>
2025-08-07 20:25:01 -07:00
157f9c1368 Fix pre-commit (#22487)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-08-07 20:21:54 -07:00
6f287915d8 Optimize MiniCPMO mask creation with vectorized implementation (#22464)
Signed-off-by: zitian.zhao <zitian.zhao@tencentmusic.com>
Signed-off-by: zitian zhao <zitian.zhao@tencentmusic.com>
2025-08-07 20:18:50 -07:00
c152e2a8a0 not tie_word_embeddings for glm-4.5 and glm-4.5v (#22460)
Signed-off-by: zRzRzRzRzRzRzR <2448370773@qq.com>
2025-08-07 19:37:23 -07:00
17eaaef595 [Bugfix] Fix RuntimeError: Index put requires the source and destination dtypes match (#22065)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-08-07 19:20:21 -07:00
3303f134e0 [Kernel] Add support for block FP8 on SM120 (NVIDIA 5090 and RTX PRO 6000) (#22131)
Signed-off-by: Junhao Li <junhao@ubicloud.com>
2025-08-07 19:18:28 -07:00
b2c8ce57c6 Fix Flashinfer CUTLASS MOE Allgather (#21963)
Signed-off-by: Shu Wang <shuw@nvidia.com>
2025-08-07 19:18:25 -07:00
a3b9c17b56 Support Tensorrt-LLM MoE fp4 for low-latency (#21331)
Signed-off-by: Shu Wang <shuw@nvidia.com>
Signed-off-by: Po-Han Huang <pohanh@nvidia.com>
Signed-off-by: Shu Wang. <shuw@nvidia.com>
Signed-off-by: XIn Li <xinli@nvidia.com>
Co-authored-by: XIn Li <xinli@nvidia.com>
2025-08-07 19:18:22 -07:00
d57dc2364e Add ModelOpt Qwen3 nvfp4 support (#20101)
Signed-off-by: Zhiyu Cheng <zhiyuc@nvidia.com>
2025-08-07 19:18:19 -07:00
e2c8f1edec [PERF] Use pybase64 to more quickly decode prompt embeddings (#22469)
Signed-off-by: Andrew Sansom <andrew@protopia.ai>
2025-08-07 19:15:32 -07:00
1ee5ead5f8 [ROCm] [V1] [SpecDec] Enable Speculative Decoding on ROCm V1 Engine (#21496)
Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
2025-08-07 19:13:17 -07:00
acf8aeb79e [Misc] normalize multiprocessing Queue usage (#22371)
Signed-off-by: Andy Xie <andy.xning@gmail.com>
2025-08-08 01:57:27 +00:00
7e3a8dc906 Remove from_dict from SpeculativeConfig (#22451)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-08-07 10:13:04 -07:00
139d155781 [Frontend] Use engine argument to control MM cache size (#22441)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-08-07 09:47:10 -07:00
8c9da6be22 [Core] Simplify mm processing cache (#22457)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-08-07 09:47:07 -07:00
399d2a10e2 Fix pre-commit error in main (#22462)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-08-07 08:54:39 -07:00
4815b00f54 [gpt-oss] Generate ResponseOutputItem from Harmony Message (#22410)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-08-07 08:33:25 -07:00
4da8bf20d0 [Tool] Fix auto tool call (#22434)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-08-07 07:03:38 -07:00
7e0b121812 [Bugfix] Add missing packed_modules_mapping to DeepseekV2ForCausalLM (#22352)
Signed-off-by: Felix Marty <Felix.Marty@amd.com>
2025-08-07 06:30:48 -07:00
766bc8162c [Core] Store only the keys for multi-modal data in P0 (#22198)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-08-07 01:45:04 -07:00
289b18e670 [Docs] Update features/disagg_prefill, add v1 examples and development (#22165)
Signed-off-by: David Chen <530634352@qq.com>
2025-08-07 00:59:23 -07:00
35171b1172 [Doc] update docs for nightly benchmarks (#12022)
Signed-off-by: Andrew Chan <andrewkchan.akc@gmail.com>
2025-08-07 00:29:45 -07:00
a2c6696bfe [Docs] Factor out troubleshooting to its own guide; add section for Ray Observability (#21578)
Signed-off-by: Ricardo Decal <rdecal@anyscale.com>
2025-08-07 00:29:13 -07:00
5e8398805e [Doc] Fix link to prefix caching design (#22384)
Signed-off-by: Yong Hoon Shin <yhshin@meta.com>
2025-08-07 00:28:15 -07:00
136825de75 [Misc] Enhance code formatting in mxfp4.py (#22423)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-08-07 00:26:24 -07:00
c2dba2dba8 Add H20-3e fused MoE kernel tuning configs for GLM-4.5 (#22433)
Signed-off-by: shaojunqi <shaojunqi.sjq@alibaba-inc.com>
Co-authored-by: shaojunqi <shaojunqi.sjq@alibaba-inc.com>
2025-08-07 00:24:47 -07:00
434d2f3f7a [Docs] Add missing dependency for docs build (#22435)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-08-07 00:22:07 -07:00
8e8e0b6af1 feat: Add --enable-log-outputs flag for logging model generations (#20707)
Signed-off-by: Adrian Garcia <adrian.garcia@inceptionai.ai>
2025-08-06 23:10:13 -07:00
82216dc21f [Misc] Support routing logic simulation (#21990)
Signed-off-by: Ming Yang <minos.future@gmail.com>
Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-08-06 23:06:20 -07:00
370661856b [Frontend] Update OpenAI error response to upstream format (#22099)
Signed-off-by: Moritz Sanft <58110325+msanft@users.noreply.github.com>
2025-08-06 23:06:00 -07:00
cbc8457b26 [Model] Switch to Fused RMS norm in Qwen2.5_VL model. (#22184)
Signed-off-by: kf <kuanfu.liu@embeddedllm.com>
Signed-off-by: tjtanaavllm <tunjian.tan@amd.com>
Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
Co-authored-by: kf <kuanfu.liu@embeddedllm.com>
2025-08-06 23:05:24 -07:00
4d4297e8fe [Bench] Split serve.py:main into async/async versions (#22405)
Signed-off-by: Linkun <github@lkchen.net>
2025-08-06 23:05:07 -07:00
2a4c825523 [CI] Skip the pooling models that do not support transformers v4.55 (#22411)
Signed-off-by: wang.yuqi <noooop@126.com>
2025-08-06 23:05:03 -07:00
4be02a3776 [Bugfix] EPLB load statistics problem (#22167)
Signed-off-by: ycyaw66 <497410282@qq.com>
Signed-off-by: David Chen <530634352@qq.com>
Co-authored-by: ycyaw66 <497410282@qq.com>
2025-08-07 04:07:54 +00:00
f6278b6243 [gpt-oss] Convert user input to harmony format (#22402)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-08-06 20:56:02 -07:00
ad6c655dde preload heavy modules when mp method is forkserver (#22214)
Signed-off-by: Lionel Villard <villard@us.ibm.com>
2025-08-06 20:33:24 -07:00
14bcf93a6a Optimize logger init performance by using module-level constants (#22373)
Signed-off-by: zitian.zhao <zitian.zhao@tencentmusic.com>
2025-08-06 20:32:19 -07:00
ecbea55ca2 Update hf_xet pin to resolve hangs (#22356)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-08-06 20:31:41 -07:00
609b533cb6 [Bugfix] Add proper comparison for package versions (#22314)
Signed-off-by: Syed Muhammad Bin Asif <syedmba7@connect.hku.hk>
2025-08-06 20:31:03 -07:00
5e9455ae8f [Bugfix]: Fix the streaming output for function calls in the minimax (#22015)
Signed-off-by: QscQ <qscqesze@gmail.com>
Signed-off-by: qingjun <qingjun@minimaxi.com>
2025-08-06 20:30:27 -07:00
a00d8b236f Use float32 for test_completion.py (#22385)
Signed-off-by: Michael Goin <mgoin64@gmail.com>
2025-08-07 11:07:47 +08:00
04cf435d95 [Bugfix] Fix wrong method name in Intern-S1 image processor (#22417)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-08-06 20:05:20 -07:00
7377131a2c [Qwen3] Enable dual-chunk-attention support for Qwen3 models. (#21924)
Signed-off-by: Tao He <linzhu.ht@alibaba-inc.com>
2025-08-06 19:58:08 -07:00
6b47ef24de [XPU]Fix flash_attn_varlen_func interface on xpu (#22350)
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
2025-08-06 19:28:11 -07:00
1dc8a70b6d [Attention] Support multiple attention metadata builders per kv_cache_spec + proper local attention no hybrid kv cache fix (#21588)
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
2025-08-06 18:40:52 -07:00
f825c6bd22 Support encoder_only attention for FlexAttention (#22273)
Signed-off-by: Max de Bayser <mbayser@br.ibm.com>
2025-08-06 18:37:14 -07:00
41b67f4263 [model] Support MiniCPM-V 4.0 (#22166)
Co-authored-by: imning3 <hbning@pku.edu.cn>
2025-08-06 18:35:46 -07:00
e8961e963a Update flashinfer-python==0.2.10 (#22389)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-08-06 18:10:24 -07:00
9a3835aaa9 Fix trtllm-gen attention env and add attention sink (#22378)
Signed-off-by: Siyuan Fu <siyuanf@nvidia.com>
Signed-off-by: Lain <fusiyuan2000@hotmail.com>
Signed-off-by: Yongye Zhu <zyy1102000@gmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Michael Goin <mgoin64@gmail.com>
Co-authored-by: Yongye Zhu <zyy1102000@gmail.com>
2025-08-06 18:07:41 -07:00
5c7cc33f4d [gpt-oss] fix model config with hf_config (#22401)
Signed-off-by: Yongye Zhu <zyy1102000@gmail.com>
2025-08-06 18:04:04 -07:00
19c9365aa4 [gpt-oss] add demo tool server (#22393)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-08-06 17:47:14 -07:00
eec890c1c1 [Bug] Fix B200 DeepGEMM E8M0 Accuracy Issue (#22399)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-08-06 17:03:53 -07:00
46a13949d5 [v1] - Mamba1 Attention Metadata (#21249)
Signed-off-by: asafg <asafg@ai21.com>
Co-authored-by: asafg <asafg@ai21.com>
2025-08-06 17:03:42 -07:00
31f09c615f [gpt-oss] flashinfer mxfp4 (#22339)
Signed-off-by: simon-mo <xmo@berkeley.edu>
Signed-off-by: Yongye Zhu <zyy1102000@gmail.com>
Co-authored-by: simon-mo <xmo@berkeley.edu>
2025-08-06 12:37:27 -07:00
31f5dc5b2a [gpt-oss] Enhance error msg on attention sink init (#22335)
Signed-off-by: simon-mo <xmo@berkeley.edu>
Signed-off-by: Yongye Zhu <zyy1102000@gmail.com>
Co-authored-by: simon-mo <xmo@berkeley.edu>
2025-08-06 11:41:42 -07:00
ec7cb19224 [gpt-oss] Add loop for built-in tool call (#22374)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Co-authored-by: LiuXiaoxuanPKU <lilyliupku@gmail.com>
Co-authored-by: simon-mo <xmo@berkeley.edu>
Co-authored-by: Chen Zhang <zhangch99@outlook.com>
Co-authored-by: Hongxia Yang <62075498+hongxiayang@users.noreply.github.com>
Co-authored-by: Minseok Lee <47620120+minseokl@users.noreply.github.com>
Co-authored-by: Yongye Zhu <zyy1102000@gmail.com>
2025-08-06 10:32:21 -07:00
2435ea7ed5 [Bugfix] Make condition in triton kernel constexpr (#22370)
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
2025-08-06 10:00:58 -07:00
4a6b72c2ab [BugFix] Fix triton compile error in kernel_unified_attention_2/3d caused by attention sinks (#22368)
Signed-off-by: LucasWilkinson <lwilkinson@neuralmagic.com>
2025-08-06 09:47:38 -07:00
b4b9813b5e add the codes to check AMD Instinct GPU number (#22367)
Signed-off-by: Zhang Jason <ning.zhang2@amd.com>
2025-08-06 08:58:38 -07:00
2cb6ef8996 [BugFix] Fix FA2 RuntimeError when sinks is provided (#22365)
Signed-off-by: LucasWilkinson <lwilkinson@neuralmagic.com>
2025-08-06 08:03:03 -07:00
9edd1db02b [Minor] Fix type (#22347)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-08-06 02:22:03 -07:00
f263a4b53f [gpt-oss] Support chat completion api (#22342) 2025-08-06 01:57:39 -07:00
54991c548a [gpt-oss] add model to supported models doc (#22336)
Signed-off-by: Roger Wang <hey@rogerw.me>
2025-08-06 01:49:44 -07:00
178d03fbd6 [gpt-oss] Add Tool/ConversationContext classes and harmony_utils (#22340)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Co-authored-by: LiuXiaoxuanPKU <lilyliupku@gmail.com>
Co-authored-by: simon-mo <xmo@berkeley.edu>
Co-authored-by: Chen Zhang <zhangch99@outlook.com>
Co-authored-by: Hongxia Yang <62075498+hongxiayang@users.noreply.github.com>
Co-authored-by: Minseok Lee <47620120+minseokl@users.noreply.github.com>
Co-authored-by: Yongye Zhu <zyy1102000@gmail.com>
2025-08-06 01:08:49 -07:00
fa00c5d75b [Misc] Clean up duplicated hf overrides (#22311)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-08-06 07:50:25 +00:00
134a8ee8fd [gpt-oss] Add openai-harmony as default dependency (#22332)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Co-authored-by: LiuXiaoxuanPKU <lilyliupku@gmail.com>
Co-authored-by: simon-mo <xmo@berkeley.edu>
Co-authored-by: Chen Zhang <zhangch99@outlook.com>
Co-authored-by: Hongxia Yang <62075498+hongxiayang@users.noreply.github.com>
Co-authored-by: Minseok Lee <47620120+minseokl@users.noreply.github.com>
Co-authored-by: Yongye Zhu <zyy1102000@gmail.com>
2025-08-06 00:10:14 -07:00
90ec006937 [gpt-oss] flashinfer attention sink init (#22330)
Signed-off-by: simon-mo <xmo@berkeley.edu>
Co-authored-by: LiuXiaoxuanPKU <lilyliupku@gmail.com>
Co-authored-by: simon-mo <xmo@berkeley.edu>
Co-authored-by: Chen Zhang <zhangch99@outlook.com>
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Co-authored-by: Hongxia Yang <62075498+hongxiayang@users.noreply.github.com>
Co-authored-by: Minseok Lee <47620120+minseokl@users.noreply.github.com>
2025-08-05 23:48:19 -07:00
a47e6ffe93 [GptOss] Add GptOss reasoning parser to support structure output (#22322)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
Co-authored-by: LiuXiaoxuanPKU <lilyliupku@gmail.com>
Co-authored-by: simon-mo <xmo@berkeley.edu>
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Co-authored-by: Hongxia Yang <62075498+hongxiayang@users.noreply.github.com>
Co-authored-by: Minseok Lee <47620120+minseokl@users.noreply.github.com>
Co-authored-by: Yongye Zhu <zyy1102000@gmail.com>
2025-08-05 23:39:13 -07:00
98a3a81024 [ROCm] Add attention sink to use_rocm_custom_paged_attention (#22329)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>

Co-authored-by: LiuXiaoxuanPKU <lilyliupku@gmail.com>
Co-authored-by: simon-mo <xmo@berkeley.edu>
Co-authored-by: Chen Zhang <zhangch99@outlook.com>
Co-authored-by: Hongxia Yang <62075498+hongxiayang@users.noreply.github.com>
Co-authored-by: Minseok Lee <47620120+minseokl@users.noreply.github.com>
Co-authored-by: Yongye Zhu <zyy1102000@gmail.com>
2025-08-05 23:30:38 -07:00
de98252f49 Add GPT-OSS model code and config [1/N] (#22327)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-08-05 23:26:00 -07:00
796bae07c5 Update transformers to v4.55 (#21931)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: Isotr0py <2037008807@qq.com>
Signed-off-by: isotr0py <2037008807@qq.com>
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: Isotr0py <2037008807@qq.com>
Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Co-authored-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-08-05 22:56:14 -07:00
6e20924350 Add attention sink in attention backends (#22320)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>

Co-authored-by: LiuXiaoxuanPKU <lilyliupku@gmail.com>
Co-authored-by: simon-mo <xmo@berkeley.edu>
Co-authored-by: Chen Zhang <zhangch99@outlook.com>
Co-authored-by: Hongxia Yang <62075498+hongxiayang@users.noreply.github.com>
Co-authored-by: Minseok Lee <47620120+minseokl@users.noreply.github.com>
Co-authored-by: Yongye Zhu <zyy1102000@gmail.com>
2025-08-05 22:37:21 -07:00
dd16bdc798 Increase openai-python version (#22316)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-08-05 21:43:21 -07:00
e3c876dca3 Upgrade FA3 for attention sink (#22313)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-08-05 21:36:21 -07:00
5d5d419ca6 [Bugfix][CI/Build][ROCm] Make sure to use the headers from the build folder on ROCm (#22264)
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
2025-08-05 20:39:32 -07:00
302962e806 [Bugfix] Skip dead and non-GPU nodes for Ray DP engine allocation (#22275)
Signed-off-by: Rui Qiao <ruisearch42@gmail.com>
2025-08-05 20:35:32 -07:00
7e6544c797 [Perf] Parallelize fill_bitmask to accelerate high-throughput guided decoding (#21862)
Signed-off-by: Benjamin Chislett <benjamin.chislett@centml.ai>
2025-08-05 19:57:49 -07:00
8e6c7e873f [Bugfix] Fix MoE BNB version (#22260)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-08-05 19:56:22 -07:00
6a51530437 [Bugfix] Fix 3D input passed into cutlass_scaled_mm (#22278)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-08-06 10:35:20 +08:00
35509fc5be [Bugfix] Remove faulty test for oot attention backend (#22286)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-08-06 00:05:40 +00:00
4b29d2784b [CI][TPU] Fix docker clean up (#22271)
Signed-off-by: Siyuan Liu <lsiyuan@google.com>
2025-08-05 23:54:56 +00:00
59a0b8554b [bugfix] fix blackwell deepep installation (#22255) 2025-08-06 01:26:09 +08:00
469b3ffaaa [V1] port xformers backend to v1 (#21342)
Signed-off-by: Giancarlo Delfin <gdelfin@meta.com>
2025-08-05 10:04:46 -07:00
ae87ddd040 [Refactor] Remove Unused Environment Variable VLLM_NO_DEPRECATION_WARNING (#22199)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-08-05 09:40:23 -07:00
a7cb6101ca [CI/Build] Update flashinfer to 0.2.9 (#22233)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-08-05 09:39:38 -07:00
c494f96fbc Use UV_LINK_MODE=copy in Dockerfile to avoid hardlink fail (#22128)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-08-05 06:57:10 -07:00
0c275ad5ad [V0 Deprecation][TPU] Remove V1 flag check from tests (#22248)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-08-05 06:53:23 -07:00
74333ae2f6 [Misc] correct static type check for GroupCoordinator (#21946)
Signed-off-by: Andy Xie <andy.xning@gmail.com>
2025-08-05 03:17:46 -07:00
83156c7b89 [NVIDIA] Support Flashinfer TRT-LLM Prefill Attention Kernel (#22095)
Signed-off-by: elvischenv <219235043+elvischenv@users.noreply.github.com>
2025-08-05 02:45:34 -07:00
4771df7b2b [Feature] Non-contiguous Support for FP8 Quantization (#21961)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
Co-authored-by: mgoin <mgoin64@gmail.com>
2025-08-05 02:36:43 -07:00
05fae02175 Migrate KimiVLImagePixelInputs to TensorSchema (#21769)
Signed-off-by: Benji Beck <benjibeck@meta.com>
Co-authored-by: Isotr0py <2037008807@qq.com>
2025-08-05 02:36:18 -07:00
d1bf1b9711 [Docs][TPU] Highlight TPU Software version selection (#22242)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-08-05 02:33:46 -07:00
586f286789 [Model] Pooling model activation supports per request control by PoolingParams (#20538)
Signed-off-by: wang.yuqi <noooop@126.com>
2025-08-05 00:37:00 -07:00
811ac13d03 [Core] Factor out common logic for MM budget calculation (#22228)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-08-04 23:54:55 -07:00
e79a12fc3a [UX] Fail if an invalid attention backend is specified (#22217)
Signed-off-by: mgoin <michael@neuralmagic.com>
2025-08-04 23:54:52 -07:00
cdfd6871a5 [Bugfix] Misaligned params in TreeAttentionImpl (#22226)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-08-04 22:40:09 -07:00
4b3e4474d7 Optimize configuration access with LRU cache in custom ops (#22204)
Signed-off-by: zitian zhao <zitian.zhao@tencentmusic.com>
2025-08-04 21:43:24 -07:00
bd3db7f469 [Misc] log more detailed message for ensure_model_parallel_initialized (#22144)
Signed-off-by: Andy Xie <andy.xning@gmail.com>
2025-08-04 19:36:55 -07:00
29b97c0995 [Doc] add backend to doc string of initialize_model_parallel (#22142)
Signed-off-by: Andy Xie <andy.xning@gmail.com>
2025-08-04 19:36:20 -07:00
7b455cf1c0 [Misc] Remove pass_config from CompilationConfig dump_json excluded (#21911)
Signed-off-by: elvischenv <219235043+elvischenv@users.noreply.github.com>
2025-08-04 19:17:18 -07:00
8a6e108e76 fix: kimi_k2 return empty tool call list (#22149)
Signed-off-by: tlipoca9 <tlipoca9@gmail.com>
2025-08-04 19:15:31 -07:00
d7b28f3415 [Log] DeepGEMM Update Log for Unaligned Problem Size (#22208)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-08-04 19:13:19 -07:00
6fa41e0c32 self.gate dtype update for GLM-4.5 (#22203)
Signed-off-by: zRzRzRzRzRzRzR <2448370773@qq.com>
2025-08-04 19:12:38 -07:00
031ca762d7 [ROCm][Bugfix] Compilation passes fix (#22202)
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
2025-08-04 19:12:28 -07:00
6ad6b8e115 [FEAT] Refactor ROPE into module (#22192)
Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
2025-08-04 19:12:16 -07:00
f4f4e7ef27 [V0 deprecation][P/D] Deprecate v0 KVConnectorBase code (1/2) (#21785)
Signed-off-by: Linkun Chen <github@lkchen.net>
2025-08-04 19:11:33 -07:00
5ea71ff46f [V1] reduce block size for tree attention correctness test to fix 'ou… (#22207)
Signed-off-by: Giancarlo Delfin <gdelfin@meta.com>
2025-08-04 19:11:06 -07:00
7175817637 Revert "[Bugfix] V1 Fix the cursor leakage issue during request scheduling." (#22223) 2025-08-04 18:37:06 -07:00
2dffac464c [Bugfix] V1 Fix the cursor leakage issue during request scheduling. (#21173)
Signed-off-by: CLFutureX <775523362@qq.com>
2025-08-04 18:34:10 -07:00
bdcb42e45d [NVIDIA] Auto detect modelopt quant and fix DSR1-FP4 weight loading (#22073) 2025-08-04 21:02:55 -04:00
c09efff976 [Bugfix][V1][P/D]Fix the uneven polling issue in the toy proxy for P2pNcclConnector (#21819)
Signed-off-by: Abatom <abzhonghua@gmail.com>
2025-08-04 20:17:05 +00:00
309c1bb822 [Bug] Update auto_tune.sh to separate benchmarking and profiling. (#21629)
Signed-off-by: Eric Hanley <ericehanley@google.com>
2025-08-04 15:12:06 +00:00
9af654cc38 [Responses API] Ignore store=True and process the request by default (#22185)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-08-04 05:12:48 -07:00
a5fff3bd49 Fix Arcee model weight loading: Add custom load_weights (#21725)
Signed-off-by: alyosha-swamy <raghav@arcee.ai>
2025-08-04 04:09:56 -07:00
1539ced93a [Doc] Update pooling model docs (#22186)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-08-04 03:37:06 -07:00
54de71d0df [Sampler] Support returning all logprobs or logits (#21792)
Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>
2025-08-04 03:04:12 -07:00
fed5849d3f [Bugfix] Fix failing GGUF models test (#22174)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-08-04 01:27:02 -07:00
c1b4eb048a [feat] move WEIGHT_SCALE_SUPPORTED into raise block to accelerate RLHF weight loading (#21164)
Signed-off-by: huangweixiao <huangweixiao@msh.team>
2025-08-04 15:43:06 +08:00
a7b8788d2c [Misc] Modify the organization of GLM series (#22171)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-08-03 23:51:20 -07:00
8ecb3e9e93 [CI Bugfix] Fix wNa16 kernel not found for test_shared_storage_connector_hashes (#22163)
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
2025-08-03 22:19:04 -07:00
e5949e5ae0 Remove index_put from MM embeddings merging (#22105)
Co-authored-by: Chenxi Yang <cxyang@meta.com>
2025-08-03 22:15:14 -07:00
49bcd893e7 [refactor] improve ConstantList exception specificity (#22156)
Signed-off-by: zitian.zhao <zitian.zhao@tencentmusic.com>
2025-08-03 22:14:49 -07:00
aa7012eb6d Add tree attention backend for v1 (part 1) (#20401)
Signed-off-by: Giancarlo Delfin <gdelfin@meta.com>
2025-08-03 22:13:26 -07:00
c2e75b3c11 remove duplicate code within cleanup_dist_env_and_memory (#22147)
Signed-off-by: Andy Xie <andy.xning@gmail.com>
2025-08-03 20:03:58 -07:00
0d7db16a92 [PD] add test for chat completions endpoint (#21925)
Signed-off-by: Abirdcfly <fp544037857@gmail.com>
2025-08-03 19:57:03 -07:00
845420ac2c [RLHF] Fix torch.dtype not serializable in example (#22158)
Signed-off-by: 22quinn <33176974+22quinn@users.noreply.github.com>
2025-08-04 02:43:33 +00:00
e27d25a0dc [fix] fix correct assertion syntax error in attention utils. (#22154)
Signed-off-by: zitian.zhao <zitian.zhao@tencentmusic.com>
2025-08-03 19:24:02 -07:00
6f5478298d Use aiohttp connection pool for benchmarking (#21981)
Signed-off-by: Seiji Eicher <seiji@anyscale.com>
2025-08-03 19:23:32 -07:00
6a39ba85fe [Bugfix] Fix failing multimodal standard test (#22153)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-08-03 19:04:38 +00:00
d3c18c9cb0 fuse fp32 for GLM-4.5 e_score_correction_bias (#22143)
Signed-off-by: zRzRzRzRzRzRzR <2448370773@qq.com>
2025-08-03 09:04:54 -07:00
83f7bbb318 Add chat doc in quick start (#21213)
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
2025-08-03 07:47:55 -07:00
b5dfb94fa0 [CI/Build][Bugfix] Fix Qwen2.5 tests in CPU CI via fallback silu_and_mul to torch native implementation (#22145)
Signed-off-by: jiang1.li <jiang1.li@intel.com>
2025-08-03 05:34:04 -07:00
6d98843b31 [Responses API] Disable response store by default (#22137)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-08-03 04:04:21 -07:00
aefeea0fde [V1] [P/D] Refactor KV Connector Path (#21980)
Signed-off-by: David Ben-David <davidb@pliops.com>
Co-authored-by: David Ben-David <davidb@pliops.com>
2025-08-03 04:03:40 -07:00
H
24d1dffbeb [executor] feat: add supports_pp attr to executors (#21786)
Signed-off-by: Haibin Lin <haibin.lin@bytedance.com>
2025-08-03 18:04:45 +08:00
7de45db9a5 [Misc] update doc comment for send (#22026)
Signed-off-by: Andy Xie <andy.xning@gmail.com>
2025-08-03 00:55:20 -07:00
789562c28c Support CUTLASS NVFP4 (w4a4) for Blackwell Geforce GPUs (SM120) (#21309)
Signed-off-by: LopezCastroRoberto <roberto.lopez.castro@udc.es>
2025-08-03 00:54:22 -07:00
3f36c325fa [Benchmark] Support ready check timeout in vllm bench serve (#21696)
Signed-off-by: Ye (Charlotte) Qi <yeq@meta.com>
Co-authored-by: Roger Wang <hey@rogerw.me>
2025-08-03 00:52:38 -07:00
3dddbf1f25 [Misc] Add tensor schema test coverage for multimodal models (#21754)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-08-03 00:52:14 -07:00
337eb23bcc [Fix] Fix llama4 modelopt weight loading error (#22107)
Signed-off-by: jiahanc <173873397+jiahanc@users.noreply.github.com>
Co-authored-by: mgoin <mgoin64@gmail.com>
2025-08-03 00:50:34 -07:00
2ff46b8826 [Misc] Bump ray to 2.48.0 (#22123)
Signed-off-by: Rui Qiao <ruisearch42@gmail.com>
2025-08-02 19:42:00 -07:00
554df8a6a2 Revert "[compile][startup] Disable C++ compilation of symbolic shapes" (#22122)
Signed-off-by: Xiao Liu <xiszishu@gmail.com>
2025-08-02 09:03:30 -07:00
73e1b9b1d4 [xpu]support moe models on XPU platform (#21643)
Signed-off-by: yan <yan.ma@intel.com>
Signed-off-by: Yan Ma <yan.ma@intel.com>
2025-08-02 07:49:08 -07:00
4abfd8796f [V1] [Hybrid] Validate compatibility of attention backend batch reordering at init time (#21557)
Signed-off-by: Thomas Parnell <tpa@zurich.ibm.com>
2025-08-02 05:29:40 -07:00
f5d0f4784f [Frontend] Improve error message for too many mm items (#22114)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-08-02 02:20:38 -07:00
b690e34824 [Model] Mamba2 preallocate SSM output tensor to avoid d2d copy overhead (#21075)
Signed-off-by: Chih-Chieh Yang <7364402+cyang49@users.noreply.github.com>
Signed-off-by: Chih-Chieh-Yang <7364402+cyang49@users.noreply.github.com>
2025-08-02 01:59:34 -07:00
25373b6c6c for glm-4.1V update (#22000)
Signed-off-by: Isotr0py <2037008807@qq.com>
Signed-off-by: zRzRzRzRzRzRzR <2448370773@qq.com>
Co-authored-by: Isotr0py <2037008807@qq.com>
2025-08-02 01:46:57 -07:00
58eee5f2e0 [PERF] Use faster way of decode in tokenizer: avoid useless list-to-list conversion (#20000)
Signed-off-by: Vadim Gimpelson <vadim.gimpelson@centml.ai>
2025-08-02 01:43:52 -07:00
067c34a155 docs: remove deprecated disable-log-requests flag (#22113)
Signed-off-by: Roger Wang <hey@rogerw.me>
2025-08-02 00:19:48 -07:00
c64861d63c [Bugfix] Mamba2 remove bugged initial state condition in chunk scan (#22034)
Signed-off-by: Chih-Chieh-Yang <7364402+cyang49@users.noreply.github.com>
2025-08-01 23:55:57 -07:00
8564dc9448 Fix test_kv_sharing_fast_prefill flakiness (#22038)
Signed-off-by: Yong Hoon Shin <yhshin@meta.com>
2025-08-01 23:55:34 -07:00
4ac8437352 [Misc] Getting and passing ray runtime_env to workers (#22040)
Signed-off-by: Rui Qiao <ruisearch42@gmail.com>
2025-08-01 23:54:40 -07:00
d3a6f2120b [FEAT][ROCm] Enable running Flash Attention as ViT attn backend for Qwen-VL models on ROCm platform. (#22069)
Signed-off-by: tjtanaavllm <tunjian.tan@amd.com>
Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
Co-authored-by: tjtanaavllm <tunjian.tan@amd.com>
2025-08-01 23:53:18 -07:00
0edaf752d7 [Attention][DBO] Add support for "splitting" the CommonAttentionMetadata (#21153)
Signed-off-by: Sage Moore <sage@neuralmagic.com>
2025-08-01 19:47:53 -07:00
6e8d8c4afb [Test] Add Unit Test for Batched DeepGEMM (#21559)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-08-02 10:45:46 +08:00
8d524ce79f [BugFix] Improve internal DP load balancing (#21617)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-08-01 19:45:27 -07:00
9f9c38c392 [Speculators][Speculative Decoding] Add Qwen Eagle3 Support (#21835)
Signed-off-by: Dipika Sikka <dipikasikka1@gmail.com>
2025-08-01 19:43:37 -07:00
a65f46be5e [Misc] DeepGemmExperts : Avoid JIT generation in the hot-path (#21955)
Signed-off-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
Co-authored-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
2025-08-01 19:42:03 -07:00
57393715e8 [Misc] VLLM_TARGET_DEVICE.lower() (#22101)
Signed-off-by: NickLucche <nlucches@redhat.com>
2025-08-01 19:41:40 -07:00
ee2eb6ecd8 [Model] Qwen2.5 VL SiLU-and-Mul (#22066)
Signed-off-by: kf <kuanfu.liu@embeddedllm.com>
Signed-off-by: vllmellm <vllm.ellm@embeddedllm.com>
Co-authored-by: kf <kuanfu.liu@embeddedllm.com>
2025-08-01 19:34:37 -07:00
23322431c8 [V1][CUDA] Full cudagraph support for FlashInfer (#21367) 2025-08-01 21:49:34 -04:00
3654847db5 feat: Add Support GPTQ Quantization MOE on ROCM vllm serve (#21733) 2025-08-01 21:12:19 -04:00
eefbf4a68b [Perf] Optimize reshape_and_cache_flash CUDA Kernel (#22036)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-08-01 19:18:51 -04:00
88faa466d7 [CI] Initial tests for SM100 Blackwell runner (#21877)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-08-01 16:18:38 -07:00
881e1af43a [BugFix] Harden distributed DP startup (#21538)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-08-01 21:40:45 +00:00
d84b97a3e3 Add lora test for tp>1 case for TPU. (#21970)
Signed-off-by: Xiongfei Wei <isaacwxf23@gmail.com>
2025-08-01 18:56:08 +00:00
d331759488 Introduce RayPPCommunicator for ray-based PP (#21660)
Signed-off-by: Rui Qiao <ruisearch42@gmail.com>
2025-08-01 11:50:58 -07:00
9659bc7f27 [compile][startup] Disable C++ compilation of symbolic shapes (#20836)
Signed-off-by: Animesh Jain <anijain@umich.edu>
2025-08-01 10:38:52 -07:00
3277e8f9e1 Fix pre-commit failure for SECURTIY.md (#22102)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-08-01 10:36:07 -07:00
8d705996df [Misc] Minor enhancement of benchmark_moe (#22068)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-08-02 01:35:30 +08:00
38c8bce8b6 Enable headless models for pooling in the Transformers backend (#21767)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-08-01 10:31:29 -07:00
ac45c44d98 [Bugfix] [Performance] DeepEPHighThroughput + DeepSeek : Quant before Dispatch (#21837)
Signed-off-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
Co-authored-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
2025-08-01 10:14:38 -07:00
d6664664b4 security policy: take 1 (#21119)
Signed-off-by: Huzaifa Sidhpurwala <huzaifas@redhat.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Russell Bryant <rbryant@redhat.com>
2025-08-01 10:09:49 -07:00
b879ecd6e2 [Bugfix] fix when skip tokenizer init (#21922)
Signed-off-by: rongfu.leng <rongfu.leng@daocloud.io>
2025-08-01 10:09:36 -07:00
3f8e952179 [Bugfix] Fix glm4.1v video inference issue (#22067)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-08-01 09:33:30 -07:00
326a1b001d Improve documentation of ModelConfig.try_get_generation_config to prevent future confusion (#21526)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-08-01 09:32:27 -07:00
2d7b09b998 Deprecate --disable-log-requests and replace with --enable-log-requests (#21739)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-08-01 17:16:37 +01:00
97608dc276 [Docs] use uv in CPU installation docs (#22089)
Signed-off-by: David Xia <david@davidxia.com>
2025-08-01 07:55:55 -07:00
3146519add [BugFix] Don't change title of top-level process (#22032)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-08-01 07:37:55 -07:00
8026a335a1 [BugFix] Update AttnFusionPass cache key (#21947)
Signed-off-by: Richard Zou <zou3519@gmail.com>
2025-08-01 07:11:29 -07:00
a59cd9d9f7 [Refactor] Fix Compile Warning #1444-D (#21462)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-08-01 06:10:30 -07:00
5c54d9759d [Bugfix][PD] set max_completion_tokens=1 if req has this value (#21841)
Signed-off-by: Abirdcfly <fp544037857@gmail.com>
2025-08-01 06:08:45 -07:00
0a6d305e0f feat(multimodal): Add customizable background color for RGBA to RGB conversion (#22052)
Signed-off-by: Jinheng Li <ahengljh@gmail.com>
Co-authored-by: Jinheng Li <ahengljh@gmail.com>
2025-08-01 06:07:33 -07:00
f81c1bb055 [Bugfix] Check NVIDIA artifactory is accessible before using flashinfer cubin kernels (#21893) 2025-08-01 08:28:45 -04:00
fb0e0d46fc Fix get_kwargs for case where type hint is list[Union[str, type]] (#22016)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-08-01 05:26:42 -07:00
26b5f7bd2a [BUG] [ROCm] Fix import bug on ROCm (#22083)
Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com>
2025-08-01 05:25:20 -07:00
dfbc1f8880 [Speculative Decoding] Add speculators config support (#21345) 2025-08-01 08:25:18 -04:00
87c94bc879 Revert "Update sampling_metadata.py (#21937)" (#22088)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-08-01 05:24:46 -07:00
28b18cc741 [Quantization] Enable BNB support for InternS1 (#21953)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-08-01 11:09:54 +00:00
4931486988 [Doc] Added warning of speculating with draft model (#22047)
Signed-off-by: Dilute-l <dilu2333@163.com>
Co-authored-by: Dilute-l <dilu2333@163.com>
2025-08-01 02:11:56 -07:00
0f81b310db [Misc] Remove upper bound in openai package version (#22060)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
2025-08-01 02:11:40 -07:00
e6680f9e25 [Bugfix] Add log prefix in non-dp mode engine core (#21889)
Signed-off-by: wuhang <wuhang6@huawei.com>
2025-08-01 09:04:16 +00:00
27a145e893 [Doc] Add example for Step3-VL (#22061)
Signed-off-by: Roger Wang <hey@rogerw.me>
2025-08-01 08:35:49 +00:00
da31f6ad3d Revert precompile wheel changes (#22055) 2025-08-01 08:26:24 +00:00
98df153abf [Frontend] Align tool_choice="required" behavior with OpenAI when tools is empty (#21052)
Signed-off-by: Sungyoon Jeong <sungyoon.jeong@furiosa.ai>
2025-08-01 07:54:17 +00:00
e0f63e4a35 [Core] Avoid repeated len(block_token_ids) check in hash_request_tokens (#21781)
Signed-off-by: linzebing <linzebing1995@gmail.com>
2025-08-01 00:23:29 -07:00
b4e081cb15 [Bugfix] Disable multi-modal preprocessor cache for DP (#21896)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-08-01 08:03:56 +01:00
79731a79f0 [Doc] Fix a syntax error of example code in structured_outputs.md (#22045)
Signed-off-by: wangzi <3220100013@zju.edu.cn>
Co-authored-by: wangzi <3220100013@zju.edu.cn>
2025-08-01 00:01:22 -07:00
53d7c39271 Update sampling_metadata.py (#21937)
Signed-off-by: Aviad Rossmann <aviadr@neureality.ai>
2025-07-31 23:23:18 -07:00
61dcc280fa [Doc] Add Voxtral to Supported Models page (#22059)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-07-31 23:10:56 -07:00
0f46a780d4 [Model] [Quantization] Support quantization for Gemma3n (#21974)
Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
2025-07-31 22:45:15 -07:00
e1a7fe4af5 [BugFix] fix: aot passes kvcache dtype information (#19750)
Signed-off-by: Mickael Seznec <mickael@mistral.ai>
2025-08-01 05:45:02 +00:00
82de9b9d46 [Misc] Automatically resolve HF processor init kwargs (#22005)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-07-31 22:44:10 -07:00
ad57f23f6a [Bugfix] Fix: Fix multi loras with tp >=2 and LRU cache (#20873)
Signed-off-by: charent <19562666+charent@users.noreply.github.com>
2025-07-31 19:48:13 -07:00
3700642013 [Refactor] Remove Duplicate per_block_cast_to_fp8, Remove Dependencies of DeepGEMM (#21787)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-08-01 01:13:27 +00:00
0bd409cf01 Move flashinfer-python to optional extra vllm[flashinfer] (#21959)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-07-31 18:02:11 -07:00
e360316ab9 Add DeepGEMM to Dockerfile in vllm-base image (#21533)
Signed-off-by: Matthew Bonanni <mbonanni001@gmail.com>
Signed-off-by: mgoin <mgoin64@gmail.com>
Co-authored-by: mgoin <mgoin64@gmail.com>
2025-07-31 18:01:55 -07:00
c3e0e9337e [Feature] Add Flashinfer MoE Support for Compressed Tensor NVFP4 (#21639)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-07-31 15:26:11 -07:00
6e672daf62 Add FlashInfer allreduce RMSNorm Quant fusion (#21069)
Signed-off-by: ilmarkov <imarkov@redhat.com>
Signed-off-by: ilmarkov <markovilya197@gmail.com>
Co-authored-by: ilmarkov <imarkov@redhat.com>
2025-07-31 13:58:38 -07:00
2dff2e21d9 [Bugfix] Fix MTP weight loading (#21941) 2025-07-31 16:33:53 -04:00
71470bc4af [Misc] Add unit tests for chunked local attention (#21692)
Signed-off-by: Yong Hoon Shin <yhshin@meta.com>
2025-07-31 11:39:16 -07:00
9e0726e5bf [Meta] Official Eagle mm support, first enablement on llama4 (#20788)
Signed-off-by: morgendave <morgendave@gmail.com>
Co-authored-by: Roger Wang <hey@rogerw.me>
2025-07-31 10:35:07 -07:00
53c21e492e Update torch_xla pin to 20250730 (#21956)
Signed-off-by: Xiongfei Wei <isaacwxf23@gmail.com>
2025-07-31 17:26:43 +00:00
0780bb5783 Removing amdproduction Tests (#22027)
Signed-off-by: Alexei V. Ivanov <alexei.ivanov@amd.com>
2025-07-31 09:53:27 -07:00
58bb902186 fix(setup): improve precompiled wheel setup for Docker builds (#22025)
Signed-off-by: dougbtv <dosmith@redhat.com>
2025-07-31 09:52:48 -07:00
7349d5268b [ez] Remove a trailing space from compilation/decorators.py (#22028) 2025-07-31 09:46:07 -07:00
9484641616 [Model] Add step3 vl (#21998)
Signed-off-by: oliveryuan <yuansong@step.ai>
Co-authored-by: oliveryuan <yuansong@step.ai>
2025-07-31 23:19:06 +08:00
207b750e19 [NVIDIA] Add SM100 Flashinfer MoE per tensor scale fp8 backend (#21458)
Signed-off-by: Amir Klein <203507526+amirkl94@users.noreply.github.com>
Signed-off-by: mgoin <mgoin64@gmail.com>
Co-authored-by: mgoin <mgoin64@gmail.com>
2025-07-31 06:00:01 -07:00
5daffe7cf6 [BugFix] Fix case where collective_rpc returns None (#22006)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-07-31 12:51:37 +00:00
2836dd73f1 [Model][CI] Let more pooling models support v1 (#21747)
Signed-off-by: wang.yuqi <noooop@126.com>
2025-07-31 01:51:15 -07:00
d2aab336ad [CI/Build] get rid of unused VLLM_FA_CMAKE_GPU_ARCHES (#21599)
Signed-off-by: Daniele Trifirò <dtrifiro@redhat.com>
2025-07-31 15:00:08 +08:00
9532a6d563 [Deprecation] Remove deprecated args and methods (#21907)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-07-30 23:46:38 -07:00
3e36fcbee6 [Bugfix]: fix metadata file copy in test_sharded_state_loader (#21830)
Signed-off-by: Andy Xie <andy.xning@gmail.com>
2025-07-31 06:22:11 +00:00
055bd3978e [CI Bugfix] Fix CI OOM for test_shared_storage_connector_hashes (#21973)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-07-31 11:45:29 +08:00
0f7919fca0 [Misc] Expand SUPPORTED_HIDDEN_SIZES for DeepEP low-latency kernels (#21818)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-07-30 20:41:12 -07:00
61445453df [UX] Rename CUTLASS_MLA_VLLM_V1 to CUTLASS_MLA (#21966)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-07-30 20:40:34 -07:00
ec02e536df [Bugfix] Relax lang pin for voxtral (#21833)
Signed-off-by: Sanchit Gandhi <sgandhi3141@gmail.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-07-30 20:38:52 -07:00
9cb497bfa3 [Example] Add async_llm_streaming.py example for AsyncLLM streaming in python (#21763)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-07-30 18:39:46 -06:00
ca9e2be3ed [Core] Move EngineCoreRequest to Request conversion out of EngineCore (#21627)
Signed-off-by: linzebing <linzebing1995@gmail.com>
2025-07-30 15:00:54 -07:00
601f856d56 [Bugfix] Fix None value handling in trace span creation for cancelled requests (#20272) 2025-07-30 14:44:02 -07:00
287f527f54 [Feature] Add async tensor parallelism for scaled mm (#20155)
Signed-off-by: cascade812 <cascade812@outlook.com>
2025-07-30 17:23:41 -04:00
f12d9256b3 [Misc] Use dracut on CentOS and skip clone if repo exists for EP kernel installation (#21635)
Signed-off-by: Ming Yang <minos.future@gmail.com>
2025-07-30 13:15:06 -07:00
b9b753e7a7 For VLLM_USE_PRECOMPILED, only compiled .so files should be extracted (#21964) 2025-07-30 13:04:40 -07:00
56bd537dde [Misc] Support more collective_rpc return types (#21845)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-07-30 10:20:20 -07:00
8f0d516715 [TPU] Support Pathways in vLLM (#21417)
Signed-off-by: wenxindongwork <wenxindong@google.com>
2025-07-30 10:02:12 -07:00
f4135232b9 feat(distributed): add get_required_kvcache_layout class method to kv connector api (#20433)
Signed-off-by: wxsm <wxsms@foxmail.com>
2025-07-30 16:41:51 +00:00
4904e53c32 [Bugfix] SharedStorage Connector for V1 PD multimodal (#21611)
Signed-off-by: fake0fan <645327136@qq.com>
Signed-off-by: herotai214 <herotai214@gmail.com>
Co-authored-by: herotai214 <herotai214@gmail.com>
2025-07-30 09:18:37 -07:00
004203e953 [CI/Build] Fix registry tests (#21934)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-07-30 09:10:41 -07:00
5c765aec65 [Bugfix] Fix TypeError in scheduler when comparing mixed request_id types (#21816)
Signed-off-by: chiliu <chiliu@paypal.com>
Co-authored-by: chiliu <chiliu@paypal.com>
2025-07-30 08:54:44 -07:00
ad510309ee Override attention metadata for fast prefill in some KV sharing setups (#21590)
Signed-off-by: Yong Hoon Shin <yhshin@meta.com>
2025-07-30 08:54:15 -07:00
366f6b3a4d [Bugfix] Fix multi-api server not working for text models (#21933)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-07-30 08:42:05 -07:00
6e599eebe8 [Bugfix] Fix OOM tests in initialization test (#21921)
Signed-off-by: Isotr0py <2037008807@qq.com>
2025-07-30 07:35:47 -07:00
88edf5994c [Docs] Reduce the size of the built docs (#21920)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-07-30 07:35:08 -07:00
ff08e51940 [NVIDIA] Fix Llama4 Scout FP4 functionality issues (#21499)
Signed-off-by: Po-Han Huang <pohanh@nvidia.com>
2025-07-30 07:33:40 -07:00
8f4a1c9a04 [Misc] Improve code readability of KVCacheManager (#21673)
Signed-off-by: tanruixiang <tanruixiang0104@gmail.com>
Signed-off-by: Ruixiang Tan <819464715@qq.com>
Signed-off-by: GitHub <noreply@github.com>
2025-07-30 07:20:43 -07:00
36ede45989 Reduce time wasted in GitHub Actions using concurrency (#21919)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-07-30 07:18:02 -07:00
0e40b26073 [CI/Build] Only run markdownlint in CI (#21892)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-07-30 07:17:14 -07:00
0271c2ff2f [Test] Add Benchmark and Unit Test for per_token_group_quant (#21860)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-07-30 07:15:02 -07:00
e91d3c9cda [misc] skip p2p check by default (#21904) 2025-07-30 22:05:04 +08:00
bf668b5bf5 [Feature] Support multiple api keys in server (#18548)
Signed-off-by: Yan Pashkovsky <yanp.bugz@gmail.com>
2025-07-30 07:03:23 -07:00
da3e0bd6e5 [Bugfix] we should use metavar is not choices (#21902)
Signed-off-by: rongfu.leng <rongfu.leng@daocloud.io>
2025-07-30 06:51:58 -07:00
fcfd1eb9c5 [Doc] Remove vLLM prefix and add citation for PagedAttention (#21910)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-07-30 06:36:34 -07:00
d979dd6beb [Feature][EPLB] Add eplb support for Qwen3 (#20815)
Signed-off-by: aladerran <aladerran@gmail.com>
2025-07-30 06:27:57 -07:00
b876860c62 [Hardware][CPU] Build fix for ARM without BF16 (#21848)
Signed-off-by: Eric Curtin <ecurtin@redhat.com>
2025-07-30 06:22:00 -07:00
13986365a9 Add @patrickvonplaten as maintainer of mistral's related files. (#21928)
Signed-off-by: Patrick von Platen <patrick.v.platen@gmail.com>
2025-07-30 20:42:51 +08:00
5c8fe389d6 [Docs] Fix the example code of streaming chat completions in reasoning (#21825)
Signed-off-by: wangzi <3220100013@zju.edu.cn>
Co-authored-by: wangzi <3220100013@zju.edu.cn>
Co-authored-by: Zi Wang <66560864+BruceW-07@users.noreply.github.com>
2025-07-30 12:11:58 +00:00
5bbaf492a6 [Doc] Update partial support (#21916)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-07-30 01:32:39 -07:00
533db0935d [benchmark] add max-concurrency in result table (#21095)
Signed-off-by: Peter Pan <Peter.Pan@daocloud.io>
2025-07-30 01:15:43 -07:00
fc91da5499 [Model] Remove DSV2 unused code (#21903)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-07-30 00:55:03 -07:00
547795232d [Tests] Fixing bug inside MultiModalProfiler. (#21842)
Signed-off-by: Varun Shenoy <varun.vinayak.shenoy@oracle.com>
2025-07-30 00:44:15 -07:00
30ef30ed5a [CI] rollback lint-and-deploy pipeline using amd machine (#21912)
Signed-off-by: Kebe <mail@kebe7jun.com>
2025-07-30 00:37:59 -07:00
02f82fe438 [Doc] Update Intern-S1 info (#21908)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-07-29 23:58:57 -07:00
2ca5f82c2a [Misc] Remove redundant config definitions (#21891)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-07-29 23:54:18 -07:00
6f8d261882 Update vLLM Benchmark Suite for Xeon based on 0.9.2 release (#21486)
Signed-off-by: Tsai, Louie <louie.tsai@intel.com>
2025-07-30 05:57:03 +00:00
4cd7fe6cea [Docs] Expand introduction to Ray in Multi-node deployment section (#21584)
Signed-off-by: Ricardo Decal <rdecal@anyscale.com>
2025-07-29 22:07:28 -07:00
16f3250527 [CI/Build] Fix pre-commit failure in docs (#21897)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-07-29 21:53:08 -07:00
e3bc17ceea Add @sighingnow as maintainer of qwen's related files. (#21895)
Signed-off-by: Tao He <linzhu.ht@alibaba-inc.com>
2025-07-29 21:30:44 -07:00
05cbbe20c5 [XPU] use ZE_AFFINITY_MASK for device select on xpu (#21815)
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
2025-07-30 03:56:14 +00:00
65f311ce59 [Frontend] Add LLM.reward specific to reward models (#21720)
Signed-off-by: wang.yuqi <noooop@126.com>
2025-07-29 20:56:03 -07:00
1b0a155534 [Perf] Using __nv_fp8_e4m3 instead of c10::e4m3 for per_token_group_quant (#21867)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-07-29 21:50:46 -06:00
44bc46da60 [Bugfix] Actually disable processing cache when API server is scaled out (#21839)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-07-29 20:36:04 -07:00
b7b23da4d2 [Bugfix] Fix comment typo of get_num_common_prefix_blocks() (#21827)
Signed-off-by: MingzhenHan <hanmingzhen2002@outlook.com>
2025-07-29 20:35:33 -07:00
fdde18229e [Bugfix] Fix shape mismatch assertion error when loading Gemma3n model with BitsAndBytes quantization (#21808)
Signed-off-by: sydarb <areebsyed237@gmail.com>
2025-07-30 11:35:21 +08:00
b917da442b Expose PyTorch profiler configuration to environment variables (#21803)
Signed-off-by: Csrayz <33659823+Csrayz@users.noreply.github.com>
2025-07-29 19:46:31 -07:00
fb58e3a651 [Docs] Update docker.md with HF_TOKEN, new model, and podman fix (#21856) 2025-07-29 19:45:41 -07:00
76080cff79 [DOC] Fix path of v1 related figures (#21868)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
2025-07-29 19:45:18 -07:00
ba5c5e5404 [Docs] Switch to better markdown linting pre-commit hook (#21851)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-07-29 19:45:08 -07:00
555e7225bc [v1][attention] Support Hybrid Allocator + FlashInfer (#21412)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-07-30 01:45:29 +00:00
0e36abf993 [Bugfix] Correct max tokens for non-contiguous embeds (#21798)
Signed-off-by: Alexandre Milesi <30204471+milesial@users.noreply.github.com>
Co-authored-by: Alexandre Milesi <30204471+milesial@users.noreply.github.com>
2025-07-30 01:16:25 +00:00
452b2a3180 [ci] mark blackwell test optional for now (#21878) 2025-07-29 18:03:27 -07:00
0d0cc9e150 [ci] add b200 test placeholder (#21866)
Signed-off-by: simon-mo <simon.mo@hey.com>
2025-07-29 17:11:50 -07:00
9266d98048 [BugFix] Fix interleaved sliding window not set for Gemma3n (#21863)
Signed-off-by: Yong Hoon Shin <yhshin@meta.com>
2025-07-29 16:34:19 -07:00
176bbce1db Revert "[AMD][CI/Build] Fix the AMD issue caused by inappropriate of symbol exposure (#21647)" (#21850)
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
2025-07-29 21:56:29 +00:00
a1873db23d docker: docker-aware precompiled wheel support (#21127)
Signed-off-by: dougbtv <dosmith@redhat.com>
2025-07-29 14:45:19 -07:00
a33ea28b1b Add flashinfer_python to CUDA wheel requirements (#21389)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-07-29 12:51:58 -07:00
7b49cb1c6b [Doc] update Contributing page's testing section (#18272)
Signed-off-by: David Xia <david@davidxia.com>
2025-07-29 10:32:46 -07:00
f03e9cf2bb [Doc] Add FusedMoE Modular Kernel Documentation (#21623)
Signed-off-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
Co-authored-by: Varun Sundar Rabindranath <vsundarr@redhat.com>
2025-07-29 10:32:30 -07:00
37f86d9048 [Docs] use uv in GPU installation docs (#20277)
Signed-off-by: David Xia <david@davidxia.com>
2025-07-29 10:32:06 -07:00
58b11b24a6 [Bugfix] Fix workspace buffer None issue for Flashinfer TRTLLM Backend (#21525)
Signed-off-by: elvischenv <219235043+elvischenv@users.noreply.github.com>
2025-07-29 10:34:00 -04:00
ad341c5194 [Bugfix]fix mixed bits and visual language model quantization in AutoRound (#21802)
Signed-off-by: Wenhua Cheng <wenhua.cheng@intel.com>
2025-07-29 07:26:31 -07:00
759b87ef3e [TPU] Add an optimization doc on TPU (#21155)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-07-29 07:23:19 -07:00
f693b067a2 [Docs] Merge design docs for a V1 only future (#21832)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-07-29 07:22:50 -07:00
04e38500ee [Bugfix] VLLM_V1 supports passing other compilation levels (#19340)
Signed-off-by: Richard Zou <zou3519@gmail.com>
2025-07-29 09:35:58 -04:00
ab714131e4 [Doc] Update compatibility matrix for pooling and multimodal models (#21831)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-07-29 06:29:51 -07:00
755fa8b657 [KVCache] Make KVCacheSpec hashable (#21791)
Signed-off-by: Chen Zhang <zhangch99@outlook.com>
2025-07-29 19:58:29 +08:00
2470419119 [Docs] Fix the outdated URL for installing from vLLM binaries (#21523)
Signed-off-by: Kay Yan <kay.yan@daocloud.io>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-07-29 04:56:27 -07:00
61a6905ab0 [Model] Refactor JambaForCausalLM (#21394)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-07-29 18:25:07 +08:00
37efc63b64 [V0 deprecation] Guided decoding (#21347)
Signed-off-by: Reza Barazesh <rezabarazesh@meta.com>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-07-29 03:15:30 -07:00
a4528f0cac [Model]: Fused MoE for nomic-embed-text-v2-moe (#18321)
Signed-off-by: isotr0py <2037008807@qq.com>
Signed-off-by: Isotr0py <2037008807@qq.com>
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-07-29 03:13:27 -07:00
a2480251ec [Doc] Link to RFC for pooling optimizations (#21806)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-07-28 23:53:18 -07:00
7234fe2685 [Misc] Rework process titles (#21780)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-07-29 05:14:47 +00:00
f1e2c095ec Migrate InternVLImageInputs and InternVLVideoInputs to TensorSchema (#21684)
Signed-off-by: Benji Beck <benjibeck@meta.com>
2025-07-28 22:09:45 -07:00
12a223ef9b [AMD][CI/Build][Bugfix] Guarding CUDA specific functions by ifndef ROCM (#21766)
Signed-off-by: Gregory Shtrasberg <Gregory.Shtrasberg@amd.com>
2025-07-29 03:35:37 +00:00
e18f085103 skip fusedmoe layer for start_load_kv (#21378)
Signed-off-by: calvin chen <wen.chen@dynamia.ai>
2025-07-28 18:59:44 -07:00
afa2607596 [CI] Parallelize Kernels MoE Test (#21764)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-07-28 18:56:24 -07:00
48b763d6b5 [Refactor] Merge Compressed Tensor FP8 CompressedTensorsW8A8Fp8MoEMethod and CompressedTensorsW8A8Fp8MoECutlassMethod (#21775)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-07-28 19:47:21 -06:00
947e982ede [Docs] Minimize spacing for supported_hardware.md table (#21779) 2025-07-28 18:46:39 -07:00
c6c9122d50 [Kernel] SM90 CUTLASS FP8 GEMM: add support for swap AB + kernel tuning (#20396)
Signed-off-by: Faqin Zhong <faqin.zhong@gmail.com>
Co-authored-by: Duncan Moss <djm.moss@gmail.com>
2025-07-28 23:13:58 +00:00
8aa1485fcf [Perf] Disable chunked local attention by default with llama4 (#21761)
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
2025-07-28 18:49:04 -04:00
89ac266b26 [Feat]: Add support for Dynamic Quant 4 bit CPU kleidiai kernels (#17112)
Signed-off-by: Nikhil Gupta <nikhil.gupta2@arm.com>
Co-authored-by: mgoin <mgoin64@gmail.com>
2025-07-28 20:55:15 +00:00
c6f36cfa26 [Bugfix] DeepGEMM is not enabled on B200 due to _lazy_init() (#21472)
Signed-off-by: Clayton Coleman <smarterclayton@gmail.com>
Co-authored-by: mgoin <mgoin64@gmail.com>
2025-07-28 20:51:22 +00:00
b18b417fbf Revert "[V1] Exception Handling when Loading KV Cache from Remote Store" (#21778)
Signed-off-by: KuntaiDu <kuntai@uchicago.edu>
2025-07-28 20:15:18 +00:00
9ba1c88a93 [AMD][CI/Build] Fix the AMD issue caused by inappropriate of symbol exposure (#21647)
Signed-off-by: Lu Fang <lufang@fb.com>
2025-07-28 20:11:16 +00:00
e0e58f9729 [Bug] Enforce contiguous input for dynamic_scaled_fp8_quant and static_scaled_fp8_quant (#21773)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
2025-07-28 19:55:48 +00:00
b361f14e39 [AMD][BugFix] Fix omission of wvSplitK kernel for small batch sizes (1-4) due to torch.compile (#21350)
Signed-off-by: Randall Smith <Randall.Smith@amd.com>
2025-07-28 15:38:20 -04:00
01c753ed98 update flashinfer to v0.2.9rc2 (#21701)
Signed-off-by: Weiliang Liu <weiliangl@nvidia.com>
2025-07-28 19:31:47 +00:00
94b71ae106 Use metavar to list the choices for a CLI arg when custom values are also accepted (#21760)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-07-28 19:31:10 +00:00
7d44c691b0 [P/D] Log warnings related to prefill KV expiry (#21753)
Signed-off-by: Nick Hill <nhill@redhat.com>
2025-07-28 18:40:53 +00:00
e17a4d3bf9 [Bugfix] Fix granite speech shape validation (#21762)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-07-28 14:19:21 -04:00
ec261b0291 [XPU] IPEX-optimized Punica Wrapper on XPU (#21703)
Signed-off-by: chzhang <chaojun.zhang@intel.com>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
2025-07-28 16:43:37 +00:00
04fe61aa3d [CI/Build] Fix plugin tests (#21758)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-07-28 15:08:05 +00:00
25708d317a [Bugfix] Mistral crashes on tool with no description (#21167)
Signed-off-by: HugoMichard <hugo@harfanglab.fr>
2025-07-28 08:03:35 -07:00
0e18a5d058 [Misc] Reduce logs for model resolution (#21765)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-07-28 07:59:56 -07:00
34a20c49b3 [Logs] Change flashinfer sampler logs to once (#21759)
Signed-off-by: mgoin <mgoin64@gmail.com>
2025-07-28 06:59:51 -07:00
31084b3b1f [Bugfix][CI/Build] Update peft version in test requirement (#21729)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-07-28 06:17:43 -07:00
bccc43c033 [Bugfix]check health for engine core process exiting unexpectedly (#21728)
Signed-off-by: wuhang <wuhang6@huawei.com>
2025-07-28 06:17:31 -07:00
1395dd9c28 [Docs] Add revision date to rendered docs (#21752)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-07-28 06:12:46 -07:00
9ace2eaf35 [Bugfix] Improve JSON extraction in LlamaToolParser (#19024)
Signed-off-by: keru <keyang.ru@oracle.com>
Co-authored-by: keru <keyang.ru@oracle.com>
2025-07-28 12:36:58 +00:00
656c24f1b5 [Ernie 4.5] Name Change for Base 0.3B Model (#21735)
Signed-off-by: vasqu <antonprogamer@gmail.com>
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-07-28 12:22:32 +00:00
63fe3a700f [PD] let p2p nccl toy proxy handle /chat/completions (#21734)
Signed-off-by: chaunceyjiang <chaunceyjiang@gmail.com>
2025-07-28 11:45:50 +00:00
0ae970ed15 [Bugfix] Fix glm4.1v video_grid_thw tensor shape scheme (#21744)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2025-07-28 04:26:49 -07:00
65e8466c37 [Bugfix] Fix environment variable setting in CPU Dockerfile (#21730)
Signed-off-by: jiang1.li <jiang1.li@intel.com>
2025-07-28 11:02:39 +00:00
1b769dccf3 [Bugfix] Fix Ernie4_5_MoeForCausalLM shared experts (#21717)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-07-28 11:02:25 +00:00
2cc571199b [feature] add log non default args in LLM (#21680)
Signed-off-by: rongfu.leng <rongfu.leng@daocloud.io>
2025-07-28 02:21:22 -07:00
a4ed731546 [Model] Prioritize Transformers fallback over suffix matching (#21719)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-07-28 02:15:31 -07:00
d128d0d554 Migrate KeyeImageInputs and KeyeVideoInputs to TensorSchema (#21686)
Signed-off-by: Benji Beck <benjibeck@meta.com>
2025-07-28 01:16:35 -07:00
a6c050286a [v1][mamba] Added mamba_type into MambaSpec (#21715)
Signed-off-by: asafg <asafg@ai21.com>
Co-authored-by: asafg <asafg@ai21.com>
2025-07-28 08:15:55 +00:00
139a7f07bd [BugFix] Fix ChunkedLocalAttention when the hybrid kv-cache is disabled (#21707)
Signed-off-by: Lucas Wilkinson <lwilkins@redhat.com>
2025-07-28 07:18:47 +00:00
150d9e6337 [Bugfix] fix max-file-size type from str to int (#21675)
Signed-off-by: Andy Xie <andy.xning@gmail.com>
2025-07-28 00:06:52 -07:00
139a97ec56 [Bugfix] Fix shape checking for Fuyu (#21709)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-07-28 00:05:56 -07:00
18cc33dd60 [bugfix] fix profile impact benchmark results (#21507)
Signed-off-by: rongfu.leng <rongfu.leng@daocloud.io>
2025-07-27 22:44:24 -07:00
7656cf4cf3 [Bugfix] [issue-21565] Fix the incompatibility issue with stream and named function calling when Thinking is disabled (#21573)
Signed-off-by: wangzi <3220100013@zju.edu.cn>
Co-authored-by: wangzi <3220100013@zju.edu.cn>
2025-07-27 22:43:50 -07:00
3ea57a56d9 Migrate Idefics3ImagePixelInputs and Idefics3ImageEmbeddingInputs to … (#21683)
Signed-off-by: Benji Beck <benjibeck@meta.com>
2025-07-27 22:37:23 -07:00
75856bc2cb Migrate GraniteSpeechAudioInputs to TensorSchema (#21682)
Signed-off-by: Benji Beck <benjibeck@meta.com>
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2025-07-27 22:37:20 -07:00
304dcdf575 Migrate GLMVImagePixelInputs to TensorSchema (#21679)
Signed-off-by: Benji Beck <benjibeck@meta.com>
2025-07-27 22:36:11 -07:00
88e46c7c8d Migrate Glm4vImageInputs, Glm4vVideoInputs to TensorSchema (#21678)
Signed-off-by: Benji Beck <benjibeck@meta.com>
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
2025-07-27 22:36:08 -07:00
d8937de4c8 Migrate Gemma3ImagePixelInputs to TensorSchema (#21676)
Signed-off-by: Benji Beck <benjibeck@meta.com>
2025-07-27 22:36:05 -07:00
e626d286f5 [FEAT] [ROCm] [AITER]: Add AITER HIP block quant kernel (#21242) 2025-07-28 05:07:06 +00:00
c7ffe93d9c [Model] Support TP/PP/mamba2 kernel for PLaMo2 (#19674)
Signed-off-by: Shinichi Hemmi <shemmi@preferred.jp>
Signed-off-by: Shinichi Hemmi <50256998+Alnusjaponica@users.noreply.github.com>
Co-authored-by: Calvin Metzger <metzger@preferred.jp>
Co-authored-by: Sixue Wang <cecilwang@preferred.jp>
2025-07-28 05:00:47 +00:00
15a72ac478 [V1] Exception Handling when Loading KV Cache from Remote Store (#21534)
Signed-off-by: liuyumoye <adeline_ly2023@outlook.com>
Co-authored-by: liuyumoye <adeline_ly2023@outlook.com>
2025-07-27 20:34:17 -07:00
04ff4be310 [Misc] Add fused_moe configs for Qwen3-Coder-480B-A35B-Instruct-FP8 (#21700)
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
2025-07-27 20:12:18 -07:00
93269bb43e Fix GLM tool parser (#21668)
Co-authored-by: Chenhui Zhang <zhang.chenhui@outlook.com>
2025-07-28 10:46:38 +08:00
82acf2184d Fix typo for limit-mm-per-prompt in docs (#21697)
Signed-off-by: Joachim Studnia <joachim@mistral.ai>
2025-07-27 19:45:37 -07:00
86ae693f20 [Deprecation][2/N] Replace --task with --runner and --convert (#21470)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-07-27 19:42:40 -07:00
8f605ee309 [Attention] Make CutlassMLA the default backend for SM100 (blackwell) (#21626)
Signed-off-by: Alexander Matveev <amatveev@redhat.com>
Signed-off-by: mgoin <mgoin64@gmail.com>
Co-authored-by: mgoin <mgoin64@gmail.com>
2025-07-27 20:13:00 +00:00
1062 changed files with 59089 additions and 30981 deletions

View File

@ -7,7 +7,7 @@ This directory contains two sets of benchmark for vllm.
- Performance benchmark: benchmark vllm's performance under various workload, for **developers** to gain clarity on whether their PR improves/degrades vllm's performance
- Nightly benchmark: compare vllm's performance against alternatives (tgi, trt-llm and lmdeploy), for **the public** to know when to choose vllm.
See [vLLM performance dashboard](https://perf.vllm.ai) for the latest performance benchmark results and [vLLM GitHub README](https://github.com/vllm-project/vllm/blob/main/README.md) for latest nightly benchmark results.
See [vLLM performance dashboard](https://hud.pytorch.org/benchmark/llms?repoName=vllm-project%2Fvllm) for the latest performance benchmark results and [vLLM GitHub README](https://github.com/vllm-project/vllm/blob/main/README.md) for latest nightly benchmark results.
## Performance benchmark quick overview
@ -28,6 +28,7 @@ See [vLLM performance dashboard](https://perf.vllm.ai) for the latest performanc
## Trigger the benchmark
Performance benchmark will be triggered when:
- A PR being merged into vllm.
- Every commit for those PRs with `perf-benchmarks` label AND `ready` label.
@ -38,6 +39,7 @@ bash .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
```
Runtime environment variables:
- `ON_CPU`: set the value to '1' on Intel® Xeon® Processors. Default value is 0.
- `SERVING_JSON`: JSON file to use for the serving tests. Default value is empty string (use default file).
- `LATENCY_JSON`: JSON file to use for the latency tests. Default value is empty string (use default file).
@ -46,12 +48,14 @@ Runtime environment variables:
- `REMOTE_PORT`: Port for the remote vLLM service to benchmark. Default value is empty string.
Nightly benchmark will be triggered when:
- Every commit for those PRs with `perf-benchmarks` label and `nightly-benchmarks` label.
## Performance benchmark details
See [performance-benchmarks-descriptions.md](performance-benchmarks-descriptions.md) for detailed descriptions, and use `tests/latency-tests.json`, `tests/throughput-tests.json`, `tests/serving-tests.json` to configure the test cases.
> NOTE: For Intel® Xeon® Processors, use `tests/latency-tests-cpu.json`, `tests/throughput-tests-cpu.json`, `tests/serving-tests-cpu.json` instead.
>
### Latency test
Here is an example of one test inside `latency-tests.json`:
@ -100,7 +104,6 @@ We test the throughput by using `vllm bench serve` with request rate = inf to co
"tensor_parallel_size": 1,
"swap_space": 16,
"disable_log_stats": "",
"disable_log_requests": "",
"load_format": "dummy"
},
"client_parameters": {
@ -136,26 +139,19 @@ The raw benchmarking results (in the format of json files) are in the `Artifacts
The `compare-json-results.py` helps to compare benchmark results JSON files converted using `convert-results-json-to-markdown.py`.
When run, benchmark script generates results under `benchmark/results` folder, along with the `benchmark_results.md` and `benchmark_results.json`.
`compare-json-results.py` compares two `benchmark_results.json` files and provides performance ratio e.g. for Output Tput, Median TTFT and Median TPOT.
If only one benchmark_results.json is passed, `compare-json-results.py` compares different TP and PP configurations in the benchmark_results.json instead.
Here is an example using the script to compare result_a and result_b without detail test name.
`python3 compare-json-results.py -f results_a/benchmark_results.json -f results_b/benchmark_results.json --ignore_test_name`
| | results_a/benchmark_results.json | results_b/benchmark_results.json | perf_ratio |
|----|----------------------------------------|----------------------------------------|----------|
| 0 | 142.633982 | 156.526018 | 1.097396 |
| 1 | 241.620334 | 294.018783 | 1.216863 |
| 2 | 218.298905 | 262.664916 | 1.203235 |
| 3 | 242.743860 | 299.816190 | 1.235113 |
Here is an example using the script to compare result_a and result_b with detail test name.
Here is an example using the script to compare result_a and result_b with Model, Dataset name, input/output lenght, max concurrency and qps.
`python3 compare-json-results.py -f results_a/benchmark_results.json -f results_b/benchmark_results.json`
| | results_a/benchmark_results.json_name | results_a/benchmark_results.json | results_b/benchmark_results.json_name | results_b/benchmark_results.json | perf_ratio |
|---|---------------------------------------------|----------------------------------------|---------------------------------------------|----------------------------------------|----------|
| 0 | serving_llama8B_tp1_sharegpt_qps_1 | 142.633982 | serving_llama8B_tp1_sharegpt_qps_1 | 156.526018 | 1.097396 |
| 1 | serving_llama8B_tp1_sharegpt_qps_16 | 241.620334 | serving_llama8B_tp1_sharegpt_qps_16 | 294.018783 | 1.216863 |
| 2 | serving_llama8B_tp1_sharegpt_qps_4 | 218.298905 | serving_llama8B_tp1_sharegpt_qps_4 | 262.664916 | 1.203235 |
| 3 | serving_llama8B_tp1_sharegpt_qps_inf | 242.743860 | serving_llama8B_tp1_sharegpt_qps_inf | 299.816190 | 1.235113 |
| 4 | serving_llama8B_tp2_random_1024_128_qps_1 | 96.613390 | serving_llama8B_tp4_random_1024_128_qps_1 | 108.404853 | 1.122048 |
| | Model | Dataset Name | Input Len | Output Len | # of max concurrency | qps | results_a/benchmark_results.json | results_b/benchmark_results.json | perf_ratio |
|----|---------------------------------------|--------|-----|-----|------|-----|-----------|----------|----------|
| 0 | meta-llama/Meta-Llama-3.1-8B-Instruct | random | 128 | 128 | 1000 | 1 | 142.633982 | 156.526018 | 1.097396 |
| 1 | meta-llama/Meta-Llama-3.1-8B-Instruct | random | 128 | 128 | 1000 | inf| 241.620334 | 294.018783 | 1.216863 |
A comparison diagram will be generated below the table.
Here is an example to compare between 96c/results_gnr_96c_091_tp2pp3 and 128c/results_gnr_128c_091_tp2pp3
<img width="1886" height="828" alt="image" src="https://github.com/user-attachments/assets/c02a43ef-25d0-4fd6-90e5-2169a28682dd" />
## Nightly test details
@ -164,9 +160,9 @@ See [nightly-descriptions.md](nightly-descriptions.md) for the detailed descript
### Workflow
- The [nightly-pipeline.yaml](nightly-pipeline.yaml) specifies the docker containers for different LLM serving engines.
- Inside each container, we run [run-nightly-suite.sh](run-nightly-suite.sh), which will probe the serving engine of the current container.
- The `run-nightly-suite.sh` will redirect the request to `tests/run-[llm serving engine name]-nightly.sh`, which parses the workload described in [nightly-tests.json](tests/nightly-tests.json) and performs the benchmark.
- At last, we run [scripts/plot-nightly-results.py](scripts/plot-nightly-results.py) to collect and plot the final benchmarking results, and update the results to buildkite.
- Inside each container, we run [scripts/run-nightly-benchmarks.sh](scripts/run-nightly-benchmarks.sh), which will probe the serving engine of the current container.
- The `scripts/run-nightly-benchmarks.sh` will parse the workload described in [nightly-tests.json](tests/nightly-tests.json) and launch the right benchmark for the specified serving engine via `scripts/launch-server.sh`.
- At last, we run [scripts/summary-nightly-results.py](scripts/summary-nightly-results.py) to collect and plot the final benchmarking results, and update the results to buildkite.
### Nightly tests
@ -176,6 +172,6 @@ In [nightly-tests.json](tests/nightly-tests.json), we include the command line a
The docker containers for benchmarking are specified in `nightly-pipeline.yaml`.
WARNING: the docker versions are HARD-CODED and SHOULD BE ALIGNED WITH `nightly-descriptions.md`. The docker versions need to be hard-coded as there are several version-specific bug fixes inside `tests/run-[llm serving engine name]-nightly.sh`.
WARNING: the docker versions are HARD-CODED and SHOULD BE ALIGNED WITH `nightly-descriptions.md`. The docker versions need to be hard-coded as there are several version-specific bug fixes inside `scripts/run-nightly-benchmarks.sh` and `scripts/launch-server.sh`.
WARNING: populating `trt-llm` to latest version is not easy, as it requires updating several protobuf files in [tensorrt-demo](https://github.com/neuralmagic/tensorrt-demo.git).

View File

@ -1,3 +1,4 @@
# Nightly benchmark annotation
## Description
@ -13,15 +14,15 @@ Please download the visualization scripts in the post
- Find the docker we use in `benchmarking pipeline`
- Deploy the docker, and inside the docker:
- Download `nightly-benchmarks.zip`.
- In the same folder, run the following code:
- Download `nightly-benchmarks.zip`.
- In the same folder, run the following code:
```bash
export HF_TOKEN=<your HF token>
apt update
apt install -y git
unzip nightly-benchmarks.zip
VLLM_SOURCE_CODE_LOC=./ bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
```
```bash
export HF_TOKEN=<your HF token>
apt update
apt install -y git
unzip nightly-benchmarks.zip
VLLM_SOURCE_CODE_LOC=./ bash .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
```
And the results will be inside `./benchmarks/results`.

View File

@ -13,25 +13,25 @@ Latest reproduction guilde: [github issue link](https://github.com/vllm-project/
## Setup
- Docker images:
- vLLM: `vllm/vllm-openai:v0.6.2`
- SGLang: `lmsysorg/sglang:v0.3.2-cu121`
- LMDeploy: `openmmlab/lmdeploy:v0.6.1-cu12`
- TensorRT-LLM: `nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3`
- *NOTE: we uses r24.07 as the current implementation only works for this version. We are going to bump this up.*
- Check [nightly-pipeline.yaml](nightly-pipeline.yaml) for the concrete docker images, specs and commands we use for the benchmark.
- vLLM: `vllm/vllm-openai:v0.6.2`
- SGLang: `lmsysorg/sglang:v0.3.2-cu121`
- LMDeploy: `openmmlab/lmdeploy:v0.6.1-cu12`
- TensorRT-LLM: `nvcr.io/nvidia/tritonserver:24.07-trtllm-python-py3`
- *NOTE: we uses r24.07 as the current implementation only works for this version. We are going to bump this up.*
- Check [nightly-pipeline.yaml](nightly-pipeline.yaml) for the concrete docker images, specs and commands we use for the benchmark.
- Hardware
- 8x Nvidia A100 GPUs
- 8x Nvidia A100 GPUs
- Workload:
- Dataset
- ShareGPT dataset
- Prefill-heavy dataset (in average 462 input tokens, 16 tokens as output)
- Decode-heavy dataset (in average 462 input tokens, 256 output tokens)
- Check [nightly-tests.json](tests/nightly-tests.json) for the concrete configuration of datasets we use.
- Models: llama-3 8B, llama-3 70B.
- We do not use llama 3.1 as it is incompatible with trt-llm r24.07. ([issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105)).
- Average QPS (query per second): 2, 4, 8, 16, 32 and inf.
- Queries are randomly sampled, and arrival patterns are determined via Poisson process, but all with fixed random seed.
- Evaluation metrics: Throughput (higher the better), TTFT (time to the first token, lower the better), ITL (inter-token latency, lower the better).
- Dataset
- ShareGPT dataset
- Prefill-heavy dataset (in average 462 input tokens, 16 tokens as output)
- Decode-heavy dataset (in average 462 input tokens, 256 output tokens)
- Check [nightly-tests.json](tests/nightly-tests.json) for the concrete configuration of datasets we use.
- Models: llama-3 8B, llama-3 70B.
- We do not use llama 3.1 as it is incompatible with trt-llm r24.07. ([issue](https://github.com/NVIDIA/TensorRT-LLM/issues/2105)).
- Average QPS (query per second): 2, 4, 8, 16, 32 and inf.
- Queries are randomly sampled, and arrival patterns are determined via Poisson process, but all with fixed random seed.
- Evaluation metrics: Throughput (higher the better), TTFT (time to the first token, lower the better), ITL (inter-token latency, lower the better).
## Known issues

View File

@ -1,3 +1,4 @@
# Performance benchmarks descriptions
## Latency tests

View File

@ -1,24 +1,38 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import json
import os
import pandas as pd
def compare_data_columns(
files, name_column, data_column, drop_column, ignore_test_name=False
files, name_column, data_column, info_cols, drop_column, debug=False
):
print("\ncompare_data_column: " + data_column)
frames = []
raw_data_cols = []
compare_frames = []
for file in files:
data_df = pd.read_json(file)
serving_df = data_df.dropna(subset=[drop_column], ignore_index=True)
if ignore_test_name is False:
# Show all info columns in the first couple columns
if not frames:
for col in info_cols:
if col not in serving_df.columns:
print(f"Skipping missing column: {col}")
continue
frames.append(serving_df[col])
# only show test name under debug mode
if debug is True:
serving_df = serving_df.rename(columns={name_column: file + "_name"})
frames.append(serving_df[file + "_name"])
file = "/".join(file.split("/")[:-1])
serving_df = serving_df.rename(columns={data_column: file})
frames.append(serving_df[file])
raw_data_cols.append(file)
compare_frames.append(serving_df[file])
if len(compare_frames) >= 2:
# Compare numbers among two files
@ -27,7 +41,68 @@ def compare_data_columns(
compare_frames.pop(1)
concat_df = pd.concat(frames, axis=1)
return concat_df
print(raw_data_cols)
return concat_df, raw_data_cols
def split_json_by_tp_pp(
input_file: str = "benchmark_results.json", output_root: str = "."
) -> list[str]:
"""
Split a benchmark JSON into separate folders by (TP Size, PP Size).
Creates: <output_root>/tp{TP}_pp{PP}/benchmark_results.json
Returns: list of file paths written.
"""
# Load JSON data into DataFrame
with open(input_file, encoding="utf-8") as f:
data = json.load(f)
# If the JSON is a dict with a list under common keys, use that list
if isinstance(data, dict):
for key in ("results", "serving_results", "benchmarks", "data"):
if isinstance(data.get(key), list):
data = data[key]
break
df = pd.DataFrame(data)
# Handle alias column names
rename_map = {
"tp_size": "TP Size",
"tensor_parallel_size": "TP Size",
"pp_size": "PP Size",
"pipeline_parallel_size": "PP Size",
}
df.rename(
columns={k: v for k, v in rename_map.items() if k in df.columns}, inplace=True
)
# Ensure TP/PP columns exist (default to 1 if missing)
if "TP Size" not in df.columns:
df["TP Size"] = 1
if "PP Size" not in df.columns:
df["PP Size"] = 1
# make sure TP/PP are numeric ints with no NaN
df["TP Size"] = (
pd.to_numeric(df.get("TP Size", 1), errors="coerce").fillna(1).astype(int)
)
df["PP Size"] = (
pd.to_numeric(df.get("PP Size", 1), errors="coerce").fillna(1).astype(int)
)
# Split into separate folders
saved_paths: list[str] = []
for (tp, pp), group_df in df.groupby(["TP Size", "PP Size"], dropna=False):
folder_name = os.path.join(output_root, f"tp{int(tp)}_pp{int(pp)}")
os.makedirs(folder_name, exist_ok=True)
filepath = os.path.join(folder_name, "benchmark_results.json")
group_df.to_json(filepath, orient="records", indent=2, force_ascii=False)
print(f"Saved: {filepath}")
saved_paths.append(filepath)
return saved_paths
if __name__ == "__main__":
@ -36,31 +111,105 @@ if __name__ == "__main__":
"-f", "--file", action="append", type=str, help="input file name"
)
parser.add_argument(
"--ignore_test_name", action="store_true", help="ignore_test_name or not"
"--debug", action="store_true", help="show all information for debugging"
)
parser.add_argument(
"--plot",
action=argparse.BooleanOptionalAction,
default=True,
help="plot perf diagrams or not --no-plot --plot",
)
parser.add_argument(
"-x",
"--xaxis",
type=str,
default="# of max concurrency.",
help="column name to use as X Axis in comparision graph",
)
args = parser.parse_args()
files = args.file
print("comparing : " + ", ".join(files))
drop_column = "P99"
name_column = "Test name"
info_cols = [
"Model",
"Dataset Name",
"Input Len",
"Output Len",
"TP Size",
"PP Size",
"# of max concurrency.",
"qps",
]
data_cols_to_compare = ["Output Tput (tok/s)", "Median TTFT (ms)", "Median"]
html_msgs_for_data_cols = [
"Compare Output Tokens /n",
"Median TTFT /n",
"Median TPOT /n",
]
ignore_test_name = args.ignore_test_name
if len(args.file) == 1:
files = split_json_by_tp_pp(args.file[0], output_root="splits")
info_cols = [c for c in info_cols if c not in ("TP Size", "PP Size")]
else:
files = args.file
print("comparing : " + ", ".join(files))
debug = args.debug
plot = args.plot
# For Plot feature, assign y axis from one of info_cols
y_axis_index = info_cols.index(args.xaxis) if args.xaxis in info_cols else 6
with open("perf_comparison.html", "w") as text_file:
for i in range(len(data_cols_to_compare)):
output_df = compare_data_columns(
output_df, raw_data_cols = compare_data_columns(
files,
name_column,
data_cols_to_compare[i],
info_cols,
drop_column,
ignore_test_name=ignore_test_name,
debug=debug,
)
print(output_df)
html = output_df.to_html()
text_file.write(html_msgs_for_data_cols[i])
text_file.write(html)
# For Plot feature, insert y axis from one of info_cols
raw_data_cols.insert(0, info_cols[y_axis_index])
filtered_info_cols = info_cols[:-2]
existing_group_cols = [
c for c in filtered_info_cols if c in output_df.columns
]
if not existing_group_cols:
raise ValueError(
f"No valid group-by columns "
f"Expected subset: {filtered_info_cols}, "
f"but DataFrame has: {list(output_df.columns)}"
)
output_df_sorted = output_df.sort_values(by=existing_group_cols)
output_groups = output_df_sorted.groupby(existing_group_cols, dropna=False)
for name, group in output_groups:
html = group.to_html()
text_file.write(html_msgs_for_data_cols[i])
text_file.write(html)
if plot is True:
import pandas as pd
import plotly.express as px
df = group[raw_data_cols]
df_sorted = df.sort_values(by=info_cols[y_axis_index])
# Melt DataFrame for plotting
df_melted = df_sorted.melt(
id_vars=info_cols[y_axis_index],
var_name="Configuration",
value_name=data_cols_to_compare[i],
)
title = data_cols_to_compare[i] + " vs " + info_cols[y_axis_index]
# Create Plotly line chart
fig = px.line(
df_melted,
x=info_cols[y_axis_index],
y=data_cols_to_compare[i],
color="Configuration",
title=title,
markers=True,
)
# Export to HTML
text_file.write(fig.to_html(full_html=True, include_plotlyjs="cdn"))

View File

@ -1,17 +1,19 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import json
import os
import shlex
from importlib import util
from pathlib import Path
from typing import Any
import pandas as pd
import psutil
import regex as re
from tabulate import tabulate
results_folder = Path("results/")
# latency results and the keys that will be printed into markdown
latency_results = []
latency_column_mapping = {
@ -42,13 +44,22 @@ throughput_results_column_mapping = {
serving_results = []
serving_column_mapping = {
"test_name": "Test name",
"model_id": "Model",
"dataset_name": "Dataset Name",
"input_len": "Input Len",
"output_len": "Output Len",
"tp_size": "TP Size",
"pp_size": "PP Size",
"dtype": "dtype",
"gpu_type": "GPU",
"completed": "# of req.",
"qps": "qps",
"max_concurrency": "# of max concurrency.",
"request_throughput": "Tput (req/s)",
"total_token_throughput": "Total Token Tput (tok/s)",
"output_throughput": "Output Tput (tok/s)",
"total_input_tokens": "Total input tokens",
"total_output_tokens": "Total output tokens",
# "total_input_tokens": "Total input tokens",
# "total_output_tokens": "Total output tokens",
"mean_ttft_ms": "Mean TTFT (ms)",
"median_ttft_ms": "Median TTFT (ms)",
"p99_ttft_ms": "P99 TTFT (ms)",
@ -93,7 +104,104 @@ def get_size_with_unit(bytes, suffix="B"):
bytes /= factor
def _coerce(val: str) -> Any:
"""Best-effort type coercion from string to Python types."""
low = val.lower()
if low == "null":
return None
if low == "true":
return True
if low == "false":
return False
# integers
if re.fullmatch(r"[+-]?\d+", val):
try:
return int(val)
except ValueError:
pass
# floats (keep 'inf'/'-inf'/'nan' as strings)
if re.fullmatch(r"[+-]?\d*\.\d+", val):
try:
return float(val)
except ValueError:
pass
return val
def parse_client_command(cmd: str) -> dict[str, Any]:
"""Parse the client_command shell string into {executable, script, args}."""
toks = shlex.split(cmd)
if len(toks) < 2:
raise ValueError("client_command must include an executable and a script")
executable, script = toks[0], toks[1]
args: dict[str, Any] = {}
i = 2
while i < len(toks):
t = toks[i]
if t.startswith("--"):
# --key=value or --key (value) or boolean flag
if "=" in t:
key, val = t.split("=", 1)
if key == "--metadata":
md = {}
if val:
if "=" in val:
k, v = val.split("=", 1)
md[k] = _coerce(v)
else:
md[val] = True
args[key] = md
else:
args[key] = _coerce(val)
i += 1
continue
key = t
# Special: consume metadata k=v pairs until next --flag
if key == "--metadata":
i += 1
md = {}
while i < len(toks) and not toks[i].startswith("--"):
pair = toks[i]
if "=" in pair:
k, v = pair.split("=", 1)
md[k] = _coerce(v)
else:
md[pair] = True
i += 1
args[key] = md
continue
# Standard: check if next token is a value (not a flag)
if i + 1 < len(toks) and not toks[i + 1].startswith("--"):
args[key] = _coerce(toks[i + 1])
i += 2
else:
# lone flag -> True
args[key] = True
i += 1
else:
# unexpected positional; skip
i += 1
return {"executable": executable, "script": script, "args": args}
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"-r",
"--result",
type=str,
default="results",
help="Folder name for benchmark output results.",
)
args = parser.parse_args()
results_folder = Path(args.result)
if not results_folder.exists():
raise FileNotFoundError(f"results folder does not exist: {results_folder}")
# collect results
for test_file in results_folder.glob("*.json"):
with open(test_file) as f:
@ -101,7 +209,6 @@ if __name__ == "__main__":
if "serving" in str(test_file):
# this result is generated via `vllm bench serve` command
# attach the benchmarking command to raw_result
try:
with open(test_file.with_suffix(".commands")) as f:
@ -109,12 +216,44 @@ if __name__ == "__main__":
except OSError as e:
print(e)
continue
# Parse Server Command Arg
out: dict[str, Any] = {
"server_command": parse_client_command(command["server_command"])
}
parse_args = [
"--tensor-parallel-size",
"--pipeline-parallel-size",
"--dtype",
]
col_mapping = ["tp_size", "pp_size", "dtype"]
for index, arg in enumerate(parse_args):
if arg in out["server_command"]["args"]:
raw_result.update(
{col_mapping[index]: out["server_command"]["args"][arg]}
)
# Parse Client Command Arg
out: dict[str, Any] = {
"client_command": parse_client_command(command["client_command"])
}
parse_args = [
"--dataset-name",
"--random-input-len",
"--random-output-len",
"--request-rate",
]
col_mapping = ["dataset_name", "input_len", "output_len", "qps"]
for index, arg in enumerate(parse_args):
if arg in out["client_command"]["args"]:
raw_result.update(
{col_mapping[index]: out["client_command"]["args"][arg]}
)
# Add Server, Client command
raw_result.update(command)
# update the test name of this result
raw_result.update({"test_name": test_file.stem})
# add the result to raw_result
serving_results.append(raw_result)
continue
@ -204,7 +343,10 @@ if __name__ == "__main__":
columns=latency_column_mapping
)
if not serving_results.empty:
serving_results = serving_results[list(serving_column_mapping.keys())].rename(
valid_columns = [
col for col in serving_column_mapping if col in serving_results.columns
]
serving_results = serving_results[valid_columns].rename(
columns=serving_column_mapping
)
if not throughput_results.empty:
@ -244,7 +386,9 @@ if __name__ == "__main__":
)
# document the result
with open(results_folder / "benchmark_results.md", "w") as f:
md_file = "benchmark_results.md"
json_file = "benchmark_results.json"
with open(results_folder / md_file, "w") as f:
results = read_markdown(
"../.buildkite/nightly-benchmarks/"
+ "performance-benchmarks-descriptions.md"
@ -259,7 +403,7 @@ if __name__ == "__main__":
f.write(results)
# document benchmarking results in json
with open(results_folder / "benchmark_results.json", "w") as f:
with open(results_folder / json_file, "w") as f:
results = (
latency_results.to_dict(orient="records")
+ throughput_results.to_dict(orient="records")

View File

@ -33,7 +33,7 @@ check_gpus() {
check_cpus() {
# check the number of CPUs and NUMA Node and GPU type.
declare -g numa_count=$(python3 -c "from numa import info;numa_size = info.get_num_configured_nodes(); print(numa_size)")
declare -g numa_count=$(lscpu | grep "NUMA node(s):" | awk '{print $3}')
if [[ $numa_count -gt 0 ]]; then
echo "NUMA found."
echo $numa_count
@ -194,9 +194,11 @@ run_latency_tests() {
# check if there is enough GPU to run the test
tp=$(echo "$latency_params" | jq -r '.tensor_parallel_size')
if [ "$ON_CPU" == "1" ];then
if [[ $numa_count -lt $tp ]]; then
echo "Required tensor-parallel-size $tp but only $numa_count NUMA nodes found. Skip testcase $test_name."
if [ "$ON_CPU" == "1" ]; then
pp=$(echo "$latency_params" | jq -r '.pipeline_parallel_size')
world_size=$(($tp*$pp))
if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then
echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
continue
fi
else
@ -261,9 +263,11 @@ run_throughput_tests() {
# check if there is enough GPU to run the test
tp=$(echo "$throughput_params" | jq -r '.tensor_parallel_size')
if [ "$ON_CPU" == "1" ];then
if [[ $numa_count -lt $tp ]]; then
echo "Required tensor-parallel-size $tp but only $numa_count NUMA nodes found. Skip testcase $test_name."
if [ "$ON_CPU" == "1" ]; then
pp=$(echo "$throughput_params" | jq -r '.pipeline_parallel_size')
world_size=$(($tp*$pp))
if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then
echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
continue
fi
else
@ -329,12 +333,21 @@ run_serving_tests() {
qps_list=$(echo "$params" | jq -r '.qps_list')
qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
echo "Running over qps list $qps_list"
max_concurrency_list=$(echo "$params" | jq -r '.max_concurrency_list')
if [[ -z "$max_concurrency_list" || "$max_concurrency_list" == "null" ]]; then
num_prompts=$(echo "$client_params" | jq -r '.num_prompts')
max_concurrency_list="[$num_prompts]"
fi
max_concurrency_list=$(echo "$max_concurrency_list" | jq -r '.[] | @sh')
echo "Running over max concurrency list $max_concurrency_list"
# check if there is enough resources to run the test
tp=$(echo "$server_params" | jq -r '.tensor_parallel_size')
if [ "$ON_CPU" == "1" ];then
if [[ $numa_count -lt $tp ]]; then
echo "Required tensor-parallel-size $tp but only $numa_count NUMA nodes found. Skip testcase $test_name."
if [ "$ON_CPU" == "1" ]; then
pp=$(echo "$server_params" | jq -r '.pipeline_parallel_size')
world_size=$(($tp*$pp))
if [[ $numa_count -lt $world_size && -z "${REMOTE_HOST}" ]]; then
echo "Required world-size $world_size but only $numa_count NUMA nodes found. Skip testcase $test_name."
continue
fi
else
@ -390,35 +403,39 @@ run_serving_tests() {
echo "now qps is $qps"
fi
new_test_name=$test_name"_qps_"$qps
# iterate over different max_concurrency
for max_concurrency in $max_concurrency_list; do
new_test_name=$test_name"_qps_"$qps"_concurrency_"$max_concurrency
echo " new test name $new_test_name"
# pass the tensor parallel size to the client so that it can be displayed
# on the benchmark dashboard
client_command="vllm bench serve \
--save-result \
--result-dir $RESULTS_FOLDER \
--result-filename ${new_test_name}.json \
--request-rate $qps \
--max-concurrency $max_concurrency \
--metadata "tensor_parallel_size=$tp" \
$client_args $client_remote_args "
# pass the tensor parallel size to the client so that it can be displayed
# on the benchmark dashboard
client_command="vllm bench serve \
--save-result \
--result-dir $RESULTS_FOLDER \
--result-filename ${new_test_name}.json \
--request-rate $qps \
--metadata "tensor_parallel_size=$tp" \
$client_args $client_remote_args "
echo "Running test case $test_name with qps $qps"
echo "Client command: $client_command"
echo "Running test case $test_name with qps $qps"
echo "Client command: $client_command"
bash -c "$client_command"
bash -c "$client_command"
# record the benchmarking commands
jq_output=$(jq -n \
--arg server "$server_command" \
--arg client "$client_command" \
--arg gpu "$gpu_type" \
'{
server_command: $server,
client_command: $client,
gpu_type: $gpu
}')
echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
# record the benchmarking commands
jq_output=$(jq -n \
--arg server "$server_command" \
--arg client "$client_command" \
--arg gpu "$gpu_type" \
'{
server_command: $server,
client_command: $client,
gpu_type: $gpu
}')
echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
done
done
# clean up

View File

@ -11,9 +11,7 @@
},
"vllm_server_parameters": {
"disable_log_stats": "",
"disable_log_requests": "",
"gpu_memory_utilization": 0.9,
"num_scheduler_steps": 10,
"max_num_seqs": 512,
"dtype": "bfloat16"
},

View File

@ -6,7 +6,7 @@
"VLLM_CPU_KVCACHE_SPACE": 40
},
"parameters": {
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"load_format": "dummy",
"num_iters_warmup": 5,
@ -20,7 +20,7 @@
"VLLM_CPU_KVCACHE_SPACE": 40
},
"parameters": {
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 4,
"load_format": "dummy",
"num_iters_warmup": 5,

View File

@ -35,9 +35,7 @@
},
"vllm_server_parameters": {
"disable_log_stats": "",
"disable_log_requests": "",
"gpu_memory_utilization": 0.9,
"num_scheduler_steps": 10,
"max_num_seqs": 512,
"dtype": "bfloat16"
},
@ -90,9 +88,7 @@
},
"vllm_server_parameters": {
"disable_log_stats": "",
"disable_log_requests": "",
"gpu_memory_utilization": 0.9,
"num_scheduler_steps": 10,
"max_num_seqs": 512,
"dtype": "bfloat16"
},
@ -145,9 +141,7 @@
},
"vllm_server_parameters": {
"disable_log_stats": "",
"disable_log_requests": "",
"gpu_memory_utilization": 0.9,
"num_scheduler_steps": 10,
"max_num_seqs": 512,
"dtype": "bfloat16"
},
@ -197,9 +191,7 @@
},
"vllm_server_parameters": {
"disable_log_stats": "",
"disable_log_requests": "",
"gpu_memory_utilization": 0.9,
"num_scheduler_steps": 10,
"max_num_seqs": 512,
"dtype": "bfloat16"
},
@ -251,9 +243,7 @@
},
"vllm_server_parameters": {
"disable_log_stats": "",
"disable_log_requests": "",
"gpu_memory_utilization": 0.9,
"num_scheduler_steps": 10,
"max_num_seqs": 512,
"dtype": "bfloat16"
},
@ -305,9 +295,7 @@
},
"vllm_server_parameters": {
"disable_log_stats": "",
"disable_log_requests": "",
"gpu_memory_utilization": 0.9,
"num_scheduler_steps": 10,
"max_num_seqs": 512,
"dtype": "bfloat16"
},

View File

@ -0,0 +1,202 @@
[
{
"test_name": "serving_llama8B_tp1_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_tp2_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_tp4_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 4,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_tp1_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_tp2_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_tp4_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 4,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"num_prompts": 1000
}
}
]

View File

@ -0,0 +1,205 @@
[
{
"test_name": "serving_llama8B_pp1_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"pipeline_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_pp3_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"pipeline_parallel_size": 3,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_tp2pp3_sharegpt",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 2,
"pipeline_parallel_size": 3,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_pp1_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"pipeline_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_pp3_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"pipeline_parallel_size": 3,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 1000
}
},
{
"test_name": "serving_llama8B_tp2pp3_random_128_128",
"qps_list": ["inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 2,
"pipeline_parallel_size": 3,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 128,
"random-output-len": 128,
"ignore-eos": "",
"num_prompts": 1000
}
}
]

View File

@ -2,104 +2,112 @@
{
"test_name": "serving_llama8B_tp1_sharegpt",
"qps_list": [1, 4, 16, "inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"disable_log_requests": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"max_concurrency": 60,
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_tp2_sharegpt",
"qps_list": [1, 4, 16, "inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 2,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"disable_log_requests": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"max_concurrency": 60,
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_tp4_sharegpt",
"qps_list": [1, 4, 16, "inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 4,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
"block_size": 128,
"trust_remote_code": "",
"disable_log_stats": "",
"disable_log_requests": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "sharegpt",
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
"max_concurrency": 60,
"num_prompts": 200
}
},
{
"test_name": "serving_llama8B_tp4_random_1024_128",
"qps_list": [1, 4, 16, "inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 4,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
@ -107,32 +115,34 @@
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"disable_log_requests": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 1024,
"random-output-len": 128,
"ignore-eos": "",
"max_concurrency": 100,
"num_prompts": 100
}
},
{
"test_name": "serving_llama8B_pp6_random_1024_128",
"qps_list": [1, 4, 16, "inf"],
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
"server_environment_variables": {
"VLLM_RPC_TIMEOUT": 100000,
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
"VLLM_CPU_SGL_KERNEL": 1,
"VLLM_CPU_KVCACHE_SPACE": 40
},
"server_parameters": {
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"model": "meta-llama/Llama-3.1-8B-Instruct",
"pipeline_parallel_size": 6,
"dtype": "bfloat16",
"distributed_executor_backend": "mp",
@ -140,18 +150,18 @@
"trust_remote_code": "",
"enable_chunked_prefill": "",
"disable_log_stats": "",
"disable_log_requests": "",
"enforce_eager": "",
"max_num_batched_tokens": 2048,
"max_num_seqs": 256,
"load_format": "dummy"
},
"client_parameters": {
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"model": "meta-llama/Llama-3.1-8B-Instruct",
"backend": "vllm",
"dataset_name": "random",
"random-input-len": 1024,
"random-output-len": 128,
"ignore-eos": "",
"max_concurrency": 100,
"num_prompts": 100
}
}

View File

@ -7,7 +7,6 @@
"tensor_parallel_size": 1,
"swap_space": 16,
"disable_log_stats": "",
"disable_log_requests": "",
"load_format": "dummy"
},
"client_parameters": {
@ -26,7 +25,6 @@
"tensor_parallel_size": 4,
"swap_space": 16,
"disable_log_stats": "",
"disable_log_requests": "",
"load_format": "dummy"
},
"client_parameters": {
@ -45,7 +43,6 @@
"tensor_parallel_size": 2,
"swap_space": 16,
"disable_log_stats": "",
"disable_log_requests": "",
"load_format": "dummy"
},
"client_parameters": {
@ -61,7 +58,6 @@
"qps_list": [2],
"server_parameters": {
"model": "meta-llama/Meta-Llama-3.1-70B-Instruct",
"disable_log_requests": "",
"tensor_parallel_size": 4,
"swap_space": 16,
"speculative_config": {

View File

@ -6,7 +6,7 @@
"VLLM_CPU_KVCACHE_SPACE": 40
},
"parameters": {
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 1,
"load_format": "dummy",
"dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",
@ -21,7 +21,7 @@
"VLLM_CPU_KVCACHE_SPACE": 40
},
"parameters": {
"model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"model": "meta-llama/Llama-3.1-8B-Instruct",
"tensor_parallel_size": 4,
"load_format": "dummy",
"dataset": "./ShareGPT_V3_unfiltered_cleaned_split.json",

View File

@ -1,4 +1,20 @@
steps:
# aarch64 + CUDA builds
- label: "Build arm64 wheel - CUDA 12.8"
id: build-wheel-arm64-cuda-12-8
agents:
queue: arm64_cpu_queue_postmerge
commands:
# #NOTE: torch_cuda_arch_list is derived from upstream PyTorch build files here:
# https://github.com/pytorch/pytorch/blob/main/.ci/aarch64_linux/aarch64_ci_build.sh#L7
- "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.8.1 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX' --tag vllm-ci:build-image --target build --progress plain -f docker/Dockerfile ."
- "mkdir artifacts"
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
- "bash .buildkite/scripts/upload-wheels.sh"
env:
DOCKER_BUILDKIT: "1"
# x86 + CUDA builds
- label: "Build wheel - CUDA 12.8"
id: build-wheel-cuda-12-8
agents:

View File

@ -121,7 +121,6 @@ fi
if [[ $commands == *" kernels/quantization"* ]]; then
commands="${commands} \
--ignore=kernels/quantization/test_int8_quant.py \
--ignore=kernels/quantization/test_aqlm.py \
--ignore=kernels/quantization/test_machete_mm.py \
--ignore=kernels/quantization/test_block_fp8.py \
--ignore=kernels/quantization/test_block_int8.py \

View File

@ -78,6 +78,12 @@ function cpu_tests() {
# VLLM_USE_V1=0 pytest -s -v \
# tests/quantization/test_ipex_quant.py"
# Run multi-lora tests
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
pytest -s -v \
tests/lora/test_qwen2vl.py"
# online serving
docker exec cpu-test-"$NUMA_NODE" bash -c '
set -e
@ -89,12 +95,6 @@ function cpu_tests() {
--model meta-llama/Llama-3.2-3B-Instruct \
--num-prompts 20 \
--endpoint /v1/completions'
# Run multi-lora tests
docker exec cpu-test-"$NUMA_NODE" bash -c "
set -e
pytest -s -v \
tests/lora/test_qwen2vl.py"
}
# All of CPU tests are expected to be finished less than 40 mins.

View File

@ -16,8 +16,7 @@ DOCKER_BUILDKIT=1 docker build . \
--build-arg max_jobs=66 \
--build-arg nvcc_threads=2 \
--build-arg RUN_WHEEL_CHECK=false \
--build-arg torch_cuda_arch_list="9.0+PTX" \
--build-arg vllm_fa_cmake_gpu_arches="90-real"
--build-arg torch_cuda_arch_list="9.0+PTX"
# Setup cleanup
remove_docker_container() { docker rm -f gh200-test || true; }

View File

@ -5,7 +5,6 @@ set -xu
remove_docker_container() {
docker rm -f tpu-test || true;
docker rm -f vllm-tpu || true;
}
trap remove_docker_container EXIT
@ -129,7 +128,7 @@ run_and_track_test() {
# --- Actual Test Execution ---
run_and_track_test 1 "test_struct_output_generate.py" \
"HF_HUB_DISABLE_XET=1 python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\""
"python3 -m pytest -s -v /workspace/vllm/tests/v1/entrypoints/llm/test_struct_output_generate.py -k \"not test_structured_output_with_reasoning_matrices\""
run_and_track_test 2 "test_moe_pallas.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_moe_pallas.py"
run_and_track_test 3 "test_lora.py" \
@ -140,6 +139,8 @@ run_and_track_test 5 "test_spmd_model_weight_loading.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_spmd_model_weight_loading.py"
run_and_track_test 6 "test_kv_cache_update_kernel.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_kv_cache_update_kernel.py"
run_and_track_test 7 "test_tpu_int8.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_tpu_int8.py"
# After all tests have been attempted, exit with the overall status.
if [ "$overall_script_exit_code" -ne 0 ]; then

View File

@ -5,7 +5,6 @@ set -xu
remove_docker_container() {
docker rm -f tpu-test || true;
docker rm -f vllm-tpu || true;
}
trap remove_docker_container EXIT
@ -135,7 +134,7 @@ run_and_track_test 1 "test_compilation.py" \
run_and_track_test 2 "test_basic.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/v1/tpu/test_basic.py"
run_and_track_test 3 "test_accuracy.py::test_lm_eval_accuracy_v1_engine" \
"HF_HUB_DISABLE_XET=1 python3 -m pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine"
"python3 -m pytest -s -v /workspace/vllm/tests/entrypoints/llm/test_accuracy.py::test_lm_eval_accuracy_v1_engine"
run_and_track_test 4 "test_quantization_accuracy.py" \
"python3 -m pytest -s -v /workspace/vllm/tests/tpu/test_quantization_accuracy.py"
run_and_track_test 5 "examples/offline_inference/tpu.py" \

View File

@ -1,6 +1,6 @@
# Environment config
TEST_NAME=llama8b
CONTAINER_NAME=vllm-tpu
CONTAINER_NAME=tpu-test
# vllm config
MODEL=meta-llama/Llama-3.1-8B-Instruct

View File

@ -12,8 +12,6 @@ source /etc/environment
source $ENV_FILE
remove_docker_container() {
docker rm -f tpu-test || true;
docker rm -f vllm-tpu || true;
docker rm -f $CONTAINER_NAME || true;
}

View File

@ -1,6 +1,6 @@
# Environment config
TEST_NAME=llama8bw8a8
CONTAINER_NAME=vllm-tpu
CONTAINER_NAME=tpu-test
# vllm config
MODEL=RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8

View File

@ -44,7 +44,6 @@ echo
VLLM_USE_V1=1 vllm serve $MODEL \
--seed 42 \
--disable-log-requests \
--max-num-seqs $MAX_NUM_SEQS \
--max-num-batched-tokens $MAX_NUM_BATCHED_TOKENS \
--tensor-parallel-size $TENSOR_PARALLEL_SIZE \

View File

@ -31,16 +31,6 @@
steps:
##### fast check tests #####
- label: Documentation Build # 2min
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/test_docs"
fast_check: true
no_gpu: True
commands:
- pip install -r ../requirements/docs.txt
# TODO: add `--strict` once warnings in docstrings are fixed
- mkdocs build
- label: Pytorch Nightly Dependency Override Check # 2min
# if this test fails, it means the nightly torch version is not compatible with some
# of the dependencies. Please check the error message and add the package to whitelist
@ -57,20 +47,20 @@ steps:
- vllm/
- tests/mq_llm_engine
- tests/async_engine
- tests/test_inputs
- tests/test_inputs.py
- tests/test_outputs.py
- tests/multimodal
- tests/test_utils
- tests/utils_
- tests/worker
- tests/standalone_tests/lazy_imports.py
commands:
- python3 standalone_tests/lazy_imports.py
- pytest -v -s mq_llm_engine # MQLLMEngine
- pytest -v -s async_engine # AsyncLLMEngine
- NUM_SCHEDULER_STEPS=4 pytest -v -s async_engine/test_async_llm_engine.py
- pytest -v -s test_inputs.py
- pytest -v -s test_outputs.py
- pytest -v -s multimodal
- pytest -v -s test_utils.py # Utils
- pytest -v -s utils_ # Utils
- pytest -v -s worker # Worker
- label: Python-only Installation Test
@ -82,7 +72,7 @@ steps:
- bash standalone_tests/python_only_compile.sh
- label: Basic Correctness Test # 30min
mirror_hardwares: [amdexperimental, amdproduction]
mirror_hardwares: [amdexperimental]
fast_check: true
torch_nightly: true
source_file_dependencies:
@ -99,7 +89,7 @@ steps:
- VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
- label: Chunked Prefill Test
mirror_hardwares: [amdexperimental, amdproduction]
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- vllm/
- tests/basic_correctness/test_chunked_prefill
@ -108,7 +98,7 @@ steps:
- VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
- label: Core Test # 10min
mirror_hardwares: [amdexperimental, amdproduction]
mirror_hardwares: [amdexperimental]
fast_check: true
source_file_dependencies:
- vllm/core
@ -128,11 +118,10 @@ steps:
- tests/entrypoints/offline_mode
commands:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
- pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_collective_rpc.py
- pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
- pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
- pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
- VLLM_USE_V1=0 pytest -v -s entrypoints/llm/test_guided_generate.py # it needs a clean process
- VLLM_USE_V1=0 pytest -v -s entrypoints/offline_mode # Needs to avoid interference with other tests
- label: Entrypoints Test (API Server) # 40min
@ -210,7 +199,7 @@ steps:
- pytest -v -s distributed/test_eplb_execute.py
- label: Metrics, Tracing Test # 10min
mirror_hardwares: [amdexperimental, amdproduction]
mirror_hardwares: [amdexperimental]
num_gpus: 2
source_file_dependencies:
- vllm/
@ -229,7 +218,7 @@ steps:
##### 1 GPU test #####
- label: Regression Test # 5min
mirror_hardwares: [amdexperimental, amdproduction]
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- vllm/
- tests/test_regression
@ -264,6 +253,7 @@ steps:
- pytest -v -s v1/engine
- pytest -v -s v1/entrypoints
- pytest -v -s v1/sample
- pytest -v -s v1/logits_processors
- pytest -v -s v1/worker
- pytest -v -s v1/structured_output
- pytest -v -s v1/spec_decode
@ -281,7 +271,7 @@ steps:
- pytest -v -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
- label: Examples Test # 25min
mirror_hardwares: [amdexperimental, amdproduction]
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/examples"
source_file_dependencies:
- vllm/entrypoints
@ -306,7 +296,7 @@ steps:
- VLLM_USE_V1=0 python3 offline_inference/profiling.py --model facebook/opt-125m run_num_steps --num-steps 2
- label: Prefix Caching Test # 9min
mirror_hardwares: [amdexperimental, amdproduction]
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- vllm/
- tests/prefix_caching
@ -315,7 +305,7 @@ steps:
- label: Platform Tests (CUDA)
mirror_hardwares: [amdexperimental, amdproduction]
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- vllm/
- tests/cuda
@ -354,9 +344,10 @@ steps:
- pytest -v -s compile/test_silu_mul_quant_fusion.py
- pytest -v -s compile/test_sequence_parallelism.py
- pytest -v -s compile/test_async_tp.py
- pytest -v -s compile/test_fusion_all_reduce.py
- label: PyTorch Fullgraph Smoke Test # 9min
mirror_hardwares: [amdexperimental, amdproduction]
mirror_hardwares: [amdexperimental]
torch_nightly: true
source_file_dependencies:
- vllm/
@ -369,7 +360,7 @@ steps:
- pytest -v -s compile/piecewise/test_full_cudagraph.py
- label: PyTorch Fullgraph Test # 18min
mirror_hardwares: [amdexperimental, amdproduction]
mirror_hardwares: [amdexperimental]
torch_nightly: true
source_file_dependencies:
- vllm/
@ -378,7 +369,7 @@ steps:
- pytest -v -s compile/test_full_graph.py
- label: Kernels Core Operation Test
mirror_hardwares: [amdexperimental, amdproduction]
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- csrc/
- tests/kernels/core
@ -403,20 +394,22 @@ steps:
- vllm/model_executor/layers/quantization
- tests/kernels/quantization
commands:
- pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
- pytest -v -s kernels/quantization --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
parallelism: 2
- label: Kernels MoE Test
- label: Kernels MoE Test %N
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- csrc/quantization/cutlass_w8a8/moe/
- csrc/moe/
- tests/kernels/moe
- vllm/model_executor/layers/fused_moe/
commands:
- pytest -v -s kernels/moe
- pytest -v -s kernels/moe --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
parallelism: 2
- label: Kernels Mamba Test
mirror_hardwares: [amdexperimental, amdproduction]
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- csrc/mamba/
- tests/kernels/mamba
@ -424,8 +417,7 @@ steps:
- pytest -v -s kernels/mamba
- label: Tensorizer Test # 11min
mirror_hardwares: [amdexperimental, amdproduction]
soft_fail: true
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- vllm/model_executor/model_loader
- tests/tensorizer_loader
@ -437,7 +429,7 @@ steps:
- pytest -v -s entrypoints/openai/test_tensorizer_entrypoint.py
- label: Model Executor Test
mirror_hardwares: [amdexperimental, amdproduction]
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- vllm/model_executor
- tests/model_executor
@ -447,7 +439,7 @@ steps:
- pytest -v -s model_executor
- label: Benchmarks # 9min
mirror_hardwares: [amdexperimental, amdproduction]
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/.buildkite"
source_file_dependencies:
- benchmarks/
@ -455,7 +447,7 @@ steps:
- bash scripts/run-benchmarks.sh
- label: Benchmarks CLI Test # 10min
mirror_hardwares: [amdexperimental, amdproduction]
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- vllm/
- tests/benchmarks/
@ -494,7 +486,7 @@ steps:
- pytest -s entrypoints/openai/correctness/
- label: Encoder Decoder tests # 5min
mirror_hardwares: [amdexperimental, amdproduction]
mirror_hardwares: [amdexperimental]
source_file_dependencies:
- vllm/
- tests/encoder_decoder
@ -502,7 +494,7 @@ steps:
- pytest -v -s encoder_decoder
- label: OpenAI-Compatible Tool Use # 20 min
mirror_hardwares: [amdexperimental, amdproduction]
mirror_hardwares: [amdexperimental]
fast_check: false
source_file_dependencies:
- vllm/
@ -534,8 +526,6 @@ steps:
- vllm/
- tests/models/language
commands:
# Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
- pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
- pip freeze | grep -E 'torch'
- pytest -v -s models/language -m core_model
@ -546,8 +536,10 @@ steps:
- vllm/
- tests/models/language/generation
commands:
# Install causal-conv1d for plamo2 models here, as it is not compatible with pip-compile.
- pip install 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.0.post8'
# Install fast path packages for testing against transformers
# Note: also needed to run plamo2 model in vLLM
- uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5'
- uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2'
- pytest -v -s models/language/generation -m hybrid_model
- label: Language Models Test (Extended Generation) # 1hr20min
@ -580,7 +572,8 @@ steps:
- pip install git+https://github.com/TIGER-AI-Lab/Mantis.git
- pip freeze | grep -E 'torch'
- pytest -v -s models/multimodal/processing
- pytest -v -s --ignore models/multimodal/generation/test_whisper.py models/multimodal -m core_model
- pytest -v -s --ignore models/multimodal/generation/test_whisper.py --ignore models/multimodal/test_tensor_schema.py models/multimodal -m core_model
- pytest -v -s models/multimodal/test_tensor_schema.py -m core_model # Needs mp_method="spawn"
- cd .. && pytest -v -s tests/models/multimodal/generation/test_whisper.py -m core_model # Otherwise, mp_method="spawn" doesn't work
- label: Multi-Modal Models Test (Extended) 1
@ -623,7 +616,7 @@ steps:
# This test is used only in PR development phase to test individual models and should never run on main
- label: Custom Models Test
mirror_hardwares: [amdexperimental, amdproduction]
mirror_hardwares: [amdexperimental]
optional: true
commands:
- echo 'Testing custom models...'
@ -643,11 +636,41 @@ steps:
- python3 examples/offline_inference/audio_language.py --model-type whisper
- python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
- label: Blackwell Test
working_dir: "/vllm-workspace/"
gpu: b200
# optional: true
source_file_dependencies:
- csrc/quantization/fp4/
- csrc/attention/mla/
- csrc/quantization/cutlass_w8a8/moe/
- vllm/model_executor/layers/fused_moe/cutlass_moe.py
- vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
- vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py
- vllm/v1/attention/backends/flashinfer.py
- vllm/compilation/fusion.py
commands:
- nvidia-smi
- python3 examples/offline_inference/basic/chat.py
# Attention
# num_heads2 broken by https://github.com/flashinfer-ai/flashinfer/issues/1353
- pytest -v -s tests/kernels/attention/test_flashinfer.py -k 'not num_heads2'
- pytest -v -s tests/kernels/attention/test_flashinfer_trtllm_attention.py
- pytest -v -s tests/kernels/test_cutlass_mla_decode.py
# Quantization
- pytest -v -s tests/kernels/quantization/test_cutlass_scaled_mm.py -k 'fp8'
- pytest -v -s tests/kernels/quantization/test_nvfp4_quant.py
- pytest -v -s tests/kernels/quantization/test_nvfp4_scaled_mm.py
- pytest -v -s tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py
- pytest -v -s tests/kernels/moe/test_nvfp4_moe.py
# Fusion
- pytest -v -s tests/compile/test_fusion_all_reduce.py
##### 1 GPU test #####
##### multi gpus test #####
- label: Distributed Comm Ops Test # 7min
mirror_hardwares: [amdexperimental, amdproduction]
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/tests"
num_gpus: 2
source_file_dependencies:
@ -718,7 +741,6 @@ steps:
# this test fails consistently.
# TODO: investigate and fix
- VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
- VLLM_USE_V1=0 CUDA_VISIBLE_DEVICES=0,1 pytest -v -s kv_transfer/test_disagg.py
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s v1/shutdown
- pytest -v -s models/multimodal/generation/test_maverick.py
@ -743,29 +765,8 @@ steps:
- pytest -v -s models/test_oot_registration.py # it needs a clean process
- pytest -v -s plugins/lora_resolvers # unit tests for in-tree lora resolver plugins
- label: Multi-step Tests (4 GPUs) # 36min
mirror_hardwares: [amdexperimental, amdproduction]
working_dir: "/vllm-workspace/tests"
num_gpus: 4
source_file_dependencies:
- vllm/model_executor/layers/sampler.py
- vllm/sequence.py
- vllm/worker/worker_base.py
- vllm/worker/worker.py
- vllm/worker/multi_step_worker.py
- vllm/worker/model_runner_base.py
- vllm/worker/model_runner.py
- vllm/worker/multi_step_model_runner.py
- vllm/engine
- tests/multi_step
commands:
# this test is quite flaky
# TODO: investigate and fix.
# - pytest -v -s multi_step/test_correctness_async_llm.py
- pytest -v -s multi_step/test_correctness_llm.py
- label: Pipeline Parallelism Test # 45min
mirror_hardwares: [amdexperimental, amdproduction]
mirror_hardwares: [amdexperimental]
working_dir: "/vllm-workspace/tests"
num_gpus: 4
source_file_dependencies:
@ -779,7 +780,7 @@ steps:
- pytest -v -s distributed/test_pipeline_parallel.py
- label: LoRA TP Test (Distributed)
mirror_hardwares: [amdexperimental, amdproduction]
mirror_hardwares: [amdexperimental]
num_gpus: 4
source_file_dependencies:
- vllm/lora
@ -792,6 +793,7 @@ steps:
# requires multi-GPU testing for validation.
- pytest -v -s -x lora/test_chatglm3_tp.py
- pytest -v -s -x lora/test_llama_tp.py
- pytest -v -s -x lora/test_multi_loras_with_tp.py
- label: Weight Loading Multiple GPU Test # 33min

26
.github/CODEOWNERS vendored
View File

@ -9,8 +9,7 @@
/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/worker/worker.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth
/vllm/model_executor/guided_decoding @mgoin @russellb @aarnphm
/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth @yewentao256
/vllm/multimodal @DarkLight1337 @ywang96
/vllm/vllm_flash_attn @LucasWilkinson
/vllm/lora @jeejeelee
@ -21,7 +20,7 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
# Any change to the VllmConfig changes can have a large user-facing impact,
# so spam a lot of people
/vllm/config.py @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor
/vllm/config @simon-mo @WoosukKwon @youkaichao @robertgshaw2-redhat @mgoin @tlrmchlsmth @houseroad @hmellor @yewentao256 @ProExpertProg
# vLLM V1
/vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat
@ -35,18 +34,15 @@ CMakeLists.txt @tlrmchlsmth @LucasWilkinson
/tests/distributed/test_pipeline_parallel.py @youkaichao
/tests/distributed/test_same_node.py @youkaichao
/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo @aarnphm
/tests/entrypoints/llm/test_guided_generate.py @mgoin @russellb @aarnphm
/tests/kernels @tlrmchlsmth @WoosukKwon
/tests/model_executor/test_guided_processors.py @mgoin @russellb
/tests/kernels @tlrmchlsmth @WoosukKwon @yewentao256
/tests/models @DarkLight1337 @ywang96
/tests/multi_step @alexm-redhat @comaniac
/tests/multimodal @DarkLight1337 @ywang96
/tests/prefix_caching @comaniac @KuntaiDu
/tests/quantization @mgoin @robertgshaw2-redhat
/tests/quantization @mgoin @robertgshaw2-redhat @yewentao256
/tests/test_inputs.py @DarkLight1337 @ywang96
/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm
/tests/v1/structured_output @mgoin @russellb @aarnphm
/tests/weight_loading @mgoin @youkaichao
/tests/weight_loading @mgoin @youkaichao @yewentao256
/tests/lora @jeejeelee
# Docs
@ -64,3 +60,15 @@ mkdocs.yaml @hmellor
/vllm/v1/worker/^xpu @jikunshang
/vllm/platforms/xpu.py @jikunshang
/docker/Dockerfile.xpu @jikunshang
# Qwen-specific files
/vllm/attention/backends/dual_chunk_flash_attn.py @sighingnow
/vllm/model_executor/models/qwen* @sighingnow
# Mistral-specific files
/vllm/model_executor/models/mistral*.py @patrickvonplaten
/vllm/model_executor/models/mixtral*.py @patrickvonplaten
/vllm/model_executor/models/voxtral*.py @patrickvonplaten
/vllm/model_executor/models/pixtral*.py @patrickvonplaten
/vllm/transformers_utils/configs/mistral.py @patrickvonplaten
/vllm/transformers_utils/tokenizers/mistral.py @patrickvonplaten

View File

@ -1,10 +1,5 @@
## Essential Elements of an Effective PR Description Checklist
- [ ] The purpose of the PR, such as "Fix some issue (link existing issues this PR will resolve)".
- [ ] The test plan, such as providing test command.
- [ ] The test results, such as pasting the results comparison before and after, or e2e results
- [ ] (Optional) The necessary documentation update, such as updating `supported_models.md` and `examples` for a new model.
PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS ABOVE HAVE BEEN CONSIDERED.
<!-- markdownlint-disable -->
PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS (AT THE BOTTOM) HAVE BEEN CONSIDERED.
## Purpose
@ -14,5 +9,14 @@ PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS ABOVE HAVE B
## (Optional) Documentation Update
<!--- pyml disable-next-line no-emphasis-as-heading -->
---
<details>
<summary> Essential Elements of an Effective PR Description Checklist </summary>
- [ ] The purpose of the PR, such as "Fix some issue (link existing issues this PR will resolve)".
- [ ] The test plan, such as providing test command.
- [ ] The test results, such as pasting the results comparison before and after, or e2e results
- [ ] (Optional) The necessary documentation update, such as updating `supported_models.md` and `examples` for a new model.
</details>
**BEFORE SUBMITTING, PLEASE READ <https://docs.vllm.ai/en/latest/contributing>** (anything written below this line will be removed by GitHub Actions)

17
.github/mergify.yml vendored
View File

@ -118,6 +118,20 @@ pull_request_rules:
add:
- qwen
- name: label-gpt-oss
description: Automatically apply gpt-oss label
conditions:
- or:
- files~=^examples/.*gpt[-_]?oss.*\.py
- files~=^tests/.*gpt[-_]?oss.*\.py
- files~=^vllm/model_executor/models/.*gpt[-_]?oss.*\.py
- files~=^vllm/model_executor/layers/.*gpt[-_]?oss.*\.py
- title~=(?i)gpt[-_]?oss
actions:
label:
add:
- gpt-oss
- name: label-rocm
description: Automatically apply rocm label
conditions:
@ -149,9 +163,6 @@ pull_request_rules:
- files=examples/offline_inference/structured_outputs.py
- files=examples/online_serving/openai_chat_completion_structured_outputs.py
- files=examples/online_serving/openai_chat_completion_structured_outputs_with_reasoning.py
- files~=^vllm/model_executor/guided_decoding/
- files=tests/model_executor/test_guided_processors.py
- files=tests/entrypoints/llm/test_guided_generate.py
- files~=^tests/v1/structured_output/
- files=tests/v1/entrypoints/llm/test_guided_generate.py
- files~=^vllm/v1/structured_output/

View File

@ -15,11 +15,11 @@ NEW=/tmp/new_pr_body.txt
gh pr view --json body --template "{{.body}}" "${PR_NUMBER}" > "${OLD}"
cp "${OLD}" "${NEW}"
# Remove "FIX #xxxx (*link existing issues this PR will resolve*)"
sed -i '/FIX #xxxx.*$/d' "${NEW}"
# Remove markdown comments (like the <!-- markdownlint-disable --> at the start)
sed -i '/<!--.*-->$/d' "${NEW}"
# Remove "FILL IN THE PR DESCRIPTION HERE"
sed -i '/FILL IN THE PR DESCRIPTION HERE/d' "${NEW}"
# Remove "PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS (AT THE BOTTOM) HAVE BEEN CONSIDERED."
sed -i '/PLEASE FILL IN THE PR DESCRIPTION HERE.*$/d' "${NEW}"
# Remove all lines after and including "**BEFORE SUBMITTING, PLEASE READ THE CHECKLIST BELOW AND FILL IN THE DESCRIPTION ABOVE**"
sed -i '/\*\*BEFORE SUBMITTING, PLEASE READ.*\*\*/,$d' "${NEW}"

View File

@ -2,12 +2,16 @@ name: Lint and Deploy Charts
on: pull_request
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
permissions:
contents: read
jobs:
lint-and-deploy:
runs-on: ubuntu-24.04-arm
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

View File

@ -0,0 +1,17 @@
{
"problemMatcher": [
{
"owner": "markdownlint",
"pattern": [
{
"regexp": "^([^:]*):(\\d+):?(\\d+)?\\s([\\w-\\/]*)\\s(.*)$",
"file": 1,
"line": 2,
"column": 3,
"code": 4,
"message": 5
}
]
}
]
}

View File

@ -5,6 +5,10 @@ on:
push:
branches: [main]
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: ${{ github.event_name == 'pull_request' }}
permissions:
contents: read
@ -17,6 +21,7 @@ jobs:
with:
python-version: "3.12"
- run: echo "::add-matcher::.github/workflows/matchers/actionlint.json"
- run: echo "::add-matcher::.github/workflows/matchers/markdownlint.json"
- run: echo "::add-matcher::.github/workflows/matchers/mypy.json"
- uses: pre-commit/action@2c7b3805fd2a0fd8c1884dcaebf91fc102a13ecd # v3.0.1
with:

View File

@ -15,7 +15,6 @@ $python_executable -m pip install -r requirements/build.txt -r requirements/cuda
export MAX_JOBS=1
# Make sure release wheels are built for the following architectures
export TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
export VLLM_FA_CMAKE_GPU_ARCHES="80-real;90-real"
bash tools/check_repo.sh

9
.gitignore vendored
View File

@ -4,6 +4,9 @@
# vllm-flash-attn built from source
vllm/vllm_flash_attn/*
# triton jit
.triton
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
@ -147,7 +150,8 @@ venv.bak/
# mkdocs documentation
/site
docs/argparse
docs/examples
docs/examples/*
!docs/examples/README.md
# mypy
.mypy_cache/
@ -203,3 +207,6 @@ shellcheck*/
# Ignore moe/marlin_moe gen code
csrc/moe/marlin_moe_wna16/kernel_*
# Ignore ep_kernels_workspace folder
ep_kernels_workspace/

13
.markdownlint.yaml Normal file
View File

@ -0,0 +1,13 @@
MD007:
indent: 4
MD013: false
MD024:
siblings_only: true
MD033: false
MD042: false
MD045: false
MD046: false
MD051: false
MD052: false
MD053: false
MD059: false

View File

@ -35,12 +35,12 @@ repos:
exclude: 'csrc/(moe/topk_softmax_kernels.cu|quantization/gguf/(ggml-common.h|dequantize.cuh|vecdotq.cuh|mmq.cuh|mmvq.cuh))|vllm/third_party/.*'
types_or: [c++, cuda]
args: [--style=file, --verbose]
- repo: https://github.com/jackdewinter/pymarkdown
rev: v0.9.29
- repo: https://github.com/igorshubovych/markdownlint-cli
rev: v0.45.0
hooks:
- id: pymarkdown
- id: markdownlint
exclude: '.*\.inc\.md'
args: [fix]
stages: [manual] # Only run in CI
- repo: https://github.com/rhysd/actionlint
rev: v1.7.7
hooks:

View File

@ -7,6 +7,9 @@ build:
os: ubuntu-22.04
tools:
python: "3.12"
jobs:
post_checkout:
- git fetch --unshallow || true
mkdocs:
configuration: mkdocs.yaml

View File

@ -249,7 +249,6 @@ set(VLLM_EXT_SRC
"csrc/quantization/gguf/gguf_kernel.cu"
"csrc/quantization/activation_kernels.cu"
"csrc/cuda_utils_kernels.cu"
"csrc/prepare_inputs/advance_step.cu"
"csrc/custom_all_reduce.cu"
"csrc/torch_bindings.cpp")
@ -287,7 +286,6 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
FetchContent_MakeAvailable(cutlass)
list(APPEND VLLM_EXT_SRC
"csrc/quantization/aqlm/gemm_kernels.cu"
"csrc/quantization/awq/gemm_kernels.cu"
"csrc/permute_cols.cu"
"csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu"
@ -351,6 +349,10 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
set_gencode_flags_for_srcs(
SRCS "${MARLIN_TEMPLATE_KERNEL_SRC}"
CUDA_ARCHS "${MARLIN_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
set_source_files_properties(${MARLIN_TEMPLATE_KERNEL_SRC}
PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
endif()
list(APPEND VLLM_EXT_SRC ${MARLIN_TEMPLATE_KERNEL_SRC})
@ -364,7 +366,12 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
set_gencode_flags_for_srcs(
SRCS "${MARLIN_SRCS}"
CUDA_ARCHS "${MARLIN_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
set_source_files_properties("csrc/quantization/gptq_marlin/gptq_marlin.cu"
PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
endif()
list(APPEND VLLM_EXT_SRC "${MARLIN_SRCS}")
message(STATUS "Building Marlin kernels for archs: ${MARLIN_ARCHS}")
else()
message(STATUS "Not building Marlin kernels as no compatible archs found"
@ -427,6 +434,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
set(SRCS
"csrc/quantization/cutlass_w8a8/scaled_mm_c3x_sm120.cu"
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_sm120_fp8.cu"
"csrc/quantization/cutlass_w8a8/c3x/scaled_mm_blockwise_sm120_fp8.cu"
)
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
@ -529,6 +537,25 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
endif()
endif()
# The nvfp4_scaled_mm_sm120 kernels for Geforce Blackwell SM120 require
# CUDA 12.8 or later
cuda_archs_loose_intersection(FP4_ARCHS "12.0;12.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS)
set(SRCS
"csrc/quantization/fp4/nvfp4_quant_kernels.cu"
"csrc/quantization/fp4/nvfp4_scaled_mm_sm120_kernels.cu")
set_gencode_flags_for_srcs(
SRCS "${SRCS}"
CUDA_ARCHS "${FP4_ARCHS}")
list(APPEND VLLM_EXT_SRC "${SRCS}")
list(APPEND VLLM_GPU_FLAGS "-DENABLE_NVFP4_SM120=1")
message(STATUS "Building NVFP4 for archs: ${FP4_ARCHS}")
else()
message(STATUS "Not building NVFP4 as no compatible archs were found.")
# clear FP4_ARCHS
set(FP4_ARCHS)
endif()
# FP4 Archs and flags
cuda_archs_loose_intersection(FP4_ARCHS "10.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND FP4_ARCHS)
@ -541,7 +568,7 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
SRCS "${SRCS}"
CUDA_ARCHS "${FP4_ARCHS}")
list(APPEND VLLM_EXT_SRC "${SRCS}")
list(APPEND VLLM_GPU_FLAGS "-DENABLE_NVFP4=1")
list(APPEND VLLM_GPU_FLAGS "-DENABLE_NVFP4_SM100=1")
list(APPEND VLLM_GPU_FLAGS "-DENABLE_CUTLASS_MOE_SM100=1")
message(STATUS "Building NVFP4 for archs: ${FP4_ARCHS}")
else()
@ -834,6 +861,10 @@ if(VLLM_GPU_LANG STREQUAL "CUDA")
set_gencode_flags_for_srcs(
SRCS "${MOE_WNAA16_MARLIN_SRC}"
CUDA_ARCHS "${MARLIN_MOE_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8)
set_source_files_properties(${MOE_WNAA16_MARLIN_SRC}
PROPERTIES COMPILE_FLAGS "-static-global-template-stub=false")
endif()
list(APPEND VLLM_MOE_EXT_SRC ${MOE_WNAA16_MARLIN_SRC})

View File

@ -1,3 +1,4 @@
<!-- markdownlint-disable MD001 MD041 -->
<p align="center">
<picture>
<source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/vllm-project/vllm/main/docs/assets/logos/vllm-logo-text-dark.png">
@ -16,14 +17,16 @@ Easy, fast, and cheap LLM serving for everyone
---
*Latest News* 🔥
- [2025/08] We hosted [vLLM Beijing Meetup](https://mp.weixin.qq.com/s/dgkWg1WFpWGO2jCdTqQHxA) focusing on large-scale LLM deployment! Please find the meetup slides [here](https://drive.google.com/drive/folders/1Pid6NSFLU43DZRi0EaTcPgXsAzDvbBqF) and the recording [here](https://www.chaspark.com/#/live/1166916873711665152).
- [2025/05] We hosted [NYC vLLM Meetup](https://lu.ma/c1rqyf1f)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1_q_aW_ioMJWUImf1s1YM-ZhjXz8cUeL0IJvaquOYBeA/edit?usp=sharing).
- [2025/05] vLLM is now a hosted project under PyTorch Foundation! Please find the announcement [here](https://pytorch.org/blog/pytorch-foundation-welcomes-vllm/).
- [2025/04] We hosted [Asia Developer Day](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/19cp6Qu8u48ihB91A064XfaXruNYiBOUKrBxAmDOllOo/edit?usp=sharing).
- [2025/01] We are excited to announce the alpha release of vLLM V1: A major architectural upgrade with 1.7x speedup! Clean code, optimized execution loop, zero-overhead prefix caching, enhanced multimodal support, and more. Please check out our blog post [here](https://blog.vllm.ai/2025/01/27/v1-alpha-release.html).
<details>
<summary>Previous News</summary>
- [2025/04] We hosted [Asia Developer Day](https://www.sginnovate.com/event/limited-availability-morning-evening-slots-remaining-inaugural-vllm-asia-developer-day)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/19cp6Qu8u48ihB91A064XfaXruNYiBOUKrBxAmDOllOo/edit?usp=sharing).
- [2025/03] We hosted [vLLM x Ollama Inference Night](https://lu.ma/vllm-ollama)! Please find the meetup slides from the vLLM team [here](https://docs.google.com/presentation/d/16T2PDD1YwRnZ4Tu8Q5r6n53c5Lr5c73UV9Vd2_eBo4U/edit?usp=sharing).
- [2025/03] We hosted [the first vLLM China Meetup](https://mp.weixin.qq.com/s/n77GibL2corAtQHtVEAzfg)! Please find the meetup slides from vLLM team [here](https://docs.google.com/presentation/d/1REHvfQMKGnvz6p3Fd23HhSO4c8j5WPGZV0bKYLwnHyQ/edit?usp=sharing).
- [2025/03] We hosted [the East Coast vLLM Meetup](https://lu.ma/7mu4k4xx)! Please find the meetup slides [here](https://docs.google.com/presentation/d/1NHiv8EUFF1NLd3fEYODm56nDmL26lEeXCaDgyDlTsRs/edit#slide=id.g31441846c39_0_0).
@ -46,6 +49,7 @@ Easy, fast, and cheap LLM serving for everyone
</details>
---
## About
vLLM is a fast and easy-to-use library for LLM inference and serving.
@ -75,6 +79,7 @@ vLLM is flexible and easy to use with:
- Multi-LoRA support
vLLM seamlessly supports most popular open-source models on HuggingFace, including:
- Transformer-like LLMs (e.g., Llama)
- Mixture-of-Expert LLMs (e.g., Mixtral, Deepseek-V2 and V3)
- Embedding Models (e.g., E5-Mistral)
@ -91,6 +96,7 @@ pip install vllm
```
Visit our [documentation](https://docs.vllm.ai/en/latest/) to learn more.
- [Installation](https://docs.vllm.ai/en/latest/getting_started/installation.html)
- [Quickstart](https://docs.vllm.ai/en/latest/getting_started/quickstart.html)
- [List of Supported Models](https://docs.vllm.ai/en/latest/models/supported_models.html)
@ -107,6 +113,7 @@ vLLM is a community project. Our compute resources for development and testing a
<!-- Note: Please sort them in alphabetical order. -->
<!-- Note: Please keep these consistent with docs/community/sponsors.md -->
Cash Donations:
- a16z
- Dropbox
- Sequoia Capital
@ -114,6 +121,8 @@ Cash Donations:
- ZhenFund
Compute Resources:
- Alibaba Cloud
- AMD
- Anyscale
- AWS
@ -153,7 +162,7 @@ If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs
## Contact Us
<!-- --8<-- [start:contact-us] -->
- For technical questions and feature requests, please use GitHub [Issues](https://github.com/vllm-project/vllm/issues) or [Discussions](https://github.com/vllm-project/vllm/discussions)
- For technical questions and feature requests, please use GitHub [Issues](https://github.com/vllm-project/vllm/issues)
- For discussing with fellow users, please use the [vLLM Forum](https://discuss.vllm.ai)
- For coordinating contributions and development, please use [Slack](https://slack.vllm.ai)
- For security disclosures, please use GitHub's [Security Advisories](https://github.com/vllm-project/vllm/security/advisories) feature

View File

@ -60,9 +60,10 @@ Please note: **No feature work allowed for cherry picks**. All PRs that are cons
Before each release, we perform end-to-end performance validation to ensure no regressions are introduced. This validation uses the [vllm-benchmark workflow](https://github.com/pytorch/pytorch-integration-testing/actions/workflows/vllm-benchmark.yml) on PyTorch CI.
**Current Coverage:**
* Models: Llama3, Llama4, and Mixtral
* Hardware: NVIDIA H100 and AMD MI300x
* *Note: Coverage may change based on new model releases and hardware availability*
* _Note: Coverage may change based on new model releases and hardware availability_
**Performance Validation Process:**
@ -71,11 +72,13 @@ Request write access to the [pytorch/pytorch-integration-testing](https://github
**Step 2: Review Benchmark Setup**
Familiarize yourself with the benchmark configurations:
* [CUDA setup](https://github.com/pytorch/pytorch-integration-testing/tree/main/vllm-benchmarks/benchmarks/cuda)
* [ROCm setup](https://github.com/pytorch/pytorch-integration-testing/tree/main/vllm-benchmarks/benchmarks/rocm)
**Step 3: Run the Benchmark**
Navigate to the [vllm-benchmark workflow](https://github.com/pytorch/pytorch-integration-testing/actions/workflows/vllm-benchmark.yml) and configure:
* **vLLM branch**: Set to the release branch (e.g., `releases/v0.9.2`)
* **vLLM commit**: Set to the RC commit hash

View File

@ -1,13 +1,45 @@
# Security Policy
## Reporting a Vulnerability
## Reporting security issues
If you believe you have found a security vulnerability in vLLM, we encourage you to let us know right away. We will investigate all legitimate reports and do our best to quickly fix the problem.
Please report security issues privately using [the vulnerability submission form](https://github.com/vllm-project/vllm/security/advisories/new).
Please report security issues privately using [the vulnerability submission form](https://github.com/vllm-project/vllm/security/advisories/new). Reports will then be triaged by the [vulnerability management team](https://docs.vllm.ai/en/latest/contributing/vulnerability_management.html).
## Issue triage
---
Reports will then be triaged by the [vulnerability management team](https://docs.vllm.ai/en/latest/contributing/vulnerability_management.html).
## Threat model
Please see the [Security Guide in the vLLM documentation](https://docs.vllm.ai/en/latest/usage/security.html) for more information on vLLM's security assumptions and recommendations.
Please see [PyTorch's Security Policy](https://github.com/pytorch/pytorch/blob/main/SECURITY.md) for more information and recommendations on how to securely interact with models.
## Issue severity
We will determine the risk of each issue, taking into account our experience dealing with past issues, versions affected, common defaults, and use cases. We use the following severity categories:
### CRITICAL Severity
Vulnerabilities that allow remote attackers to execute arbitrary code, take full control of the system, or significantly compromise confidentiality, integrity, or availability without any interaction or privileges needed, examples include remote code execution via network, deserialization issues that allow exploit chains. Generally those issues which are rated as CVSS ≥9.0.
### HIGH Severity
Serious security flaws that allow elevated impact—like RCE in specific, limited contexts or significant data loss—but require advanced conditions or some trust, examples include RCE in advanced deployment modes (e.g. multi-node), or high impact issues where some sort of privileged network access is required. These issues typically have CVSS scores between 7.0 and 8.9
### MODERATE Severity
Vulnerabilities that cause denial of service or partial disruption, but do not allow arbitrary code execution or data breach and have limited impact. These issues have a CVSS rating between 4.0 and 6.9
### LOW Severity
Minor issues such as informational disclosures, logging errors, non-exploitable flaws, or weaknesses that require local or high-privilege access and offer negligible impact. Examples include side channel attacks or hash collisions. These issues often have CVSS scores less than 4.0
## Prenotification policy
For certain security issues of CRITICAL, HIGH, or MODERATE severity level, we may prenotify certain organizations or vendors that ship vLLM. The purpose of this prenotification is to allow for a coordinated release of fixes for severe issues.
* This prenotification will be in the form of a private email notification. It may also include adding security contacts to the GitHub security advisory, typically a few days before release.
* If you wish to be added to the prenotification group, please send an email copying all the members of the [vulnerability management team](https://docs.vllm.ai/en/latest/contributing/vulnerability_management.html). Each vendor contact will be analyzed on a case-by-case basis.
* We may withdraw organizations from receiving future prenotifications if they release fixes or any other information about issues before they are public. Group membership may also change based on policy refinements for who may be included.

View File

@ -4,7 +4,7 @@ This README guides you through running benchmark tests with the extensive
datasets supported on vLLM. Its a living document, updated as new features and datasets
become available.
**Dataset Overview**
## Dataset Overview
<table style="width:100%; border-collapse: collapse;">
<thead>
@ -22,6 +22,17 @@ become available.
<td style="text-align: center;"></td>
<td><code>wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json</code></td>
</tr>
<tr>
<td><strong>ShareGPT4V (Image)</strong></td>
<td style="text-align: center;"></td>
<td style="text-align: center;"></td>
<td>
<code>wget https://huggingface.co/datasets/Lin-Chen/ShareGPT4V/blob/main/sharegpt4v_instruct_gpt4-vision_cap100k.json</code>
<br>
<div>Note that the images need to be downloaded separately. For example, to download COCO's 2017 Train images:</div>
<code>wget http://images.cocodataset.org/zips/train2017.zip</code>
</td>
</tr>
<tr>
<td><strong>BurstGPT</strong></td>
<td style="text-align: center;"></td>
@ -29,7 +40,7 @@ become available.
<td><code>wget https://github.com/HPMLL/BurstGPT/releases/download/v1.1/BurstGPT_without_fails_2.csv</code></td>
</tr>
<tr>
<td><strong>Sonnet</strong></td>
<td><strong>Sonnet (deprecated)</strong></td>
<td style="text-align: center;"></td>
<td style="text-align: center;"></td>
<td>Local file: <code>benchmarks/sonnet.txt</code></td>
@ -40,6 +51,12 @@ become available.
<td style="text-align: center;"></td>
<td><code>synthetic</code></td>
</tr>
<tr>
<td><strong>Prefix Repetition</strong></td>
<td style="text-align: center;"></td>
<td style="text-align: center;"></td>
<td><code>synthetic</code></td>
</tr>
<tr>
<td><strong>HuggingFace-VisionArena</strong></td>
<td style="text-align: center;"></td>
@ -81,16 +98,17 @@ become available.
**Note**: HuggingFace dataset's `dataset-name` should be set to `hf`
---
## 🚀 Example - Online Benchmark
<details>
<summary><b>🚀 Example - Online Benchmark</b></summary>
<summary>Show more</summary>
<br/>
First start serving your model
```bash
vllm serve NousResearch/Hermes-3-Llama-3.1-8B --disable-log-requests
vllm serve NousResearch/Hermes-3-Llama-3.1-8B
```
Then run the benchmarking script
@ -109,7 +127,7 @@ vllm bench serve \
If successful, you will see the following output
```
```text
============ Serving Benchmark Result ============
Successful requests: 10
Benchmark duration (s): 5.78
@ -133,11 +151,11 @@ P99 ITL (ms): 8.39
==================================================
```
**Custom Dataset**
### Custom Dataset
If the dataset you want to benchmark is not supported yet in vLLM, even then you can benchmark on it using `CustomDataset`. Your data needs to be in `.jsonl` format and needs to have "prompt" field per entry, e.g., data.jsonl
```
```json
{"prompt": "What is the capital of India?"}
{"prompt": "What is the capital of Iran?"}
{"prompt": "What is the capital of China?"}
@ -145,7 +163,7 @@ If the dataset you want to benchmark is not supported yet in vLLM, even then you
```bash
# start server
VLLM_USE_V1=1 vllm serve meta-llama/Llama-3.1-8B-Instruct --disable-log-requests
VLLM_USE_V1=1 vllm serve meta-llama/Llama-3.1-8B-Instruct
```
```bash
@ -166,11 +184,11 @@ vllm bench serve --port 9001 --save-result --save-detailed \
You can skip applying chat template if your data already has it by using `--custom-skip-chat-template`.
**VisionArena Benchmark for Vision Language Models**
### VisionArena Benchmark for Vision Language Models
```bash
# need a model with vision capability here
vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests
vllm serve Qwen/Qwen2-VL-7B-Instruct
```
```bash
@ -184,7 +202,7 @@ vllm bench serve \
--num-prompts 1000
```
**InstructCoder Benchmark with Speculative Decoding**
### InstructCoder Benchmark with Speculative Decoding
``` bash
VLLM_USE_V1=1 vllm serve meta-llama/Meta-Llama-3-8B-Instruct \
@ -201,13 +219,13 @@ vllm bench serve \
--num-prompts 2048
```
**Other HuggingFaceDataset Examples**
### Other HuggingFaceDataset Examples
```bash
vllm serve Qwen/Qwen2-VL-7B-Instruct --disable-log-requests
vllm serve Qwen/Qwen2-VL-7B-Instruct
```
**`lmms-lab/LLaVA-OneVision-Data`**
`lmms-lab/LLaVA-OneVision-Data`:
```bash
vllm bench serve \
@ -221,7 +239,7 @@ vllm bench serve \
--num-prompts 10
```
**`Aeala/ShareGPT_Vicuna_unfiltered`**
`Aeala/ShareGPT_Vicuna_unfiltered`:
```bash
vllm bench serve \
@ -234,7 +252,7 @@ vllm bench serve \
--num-prompts 10
```
**`AI-MO/aimo-validation-aime`**
`AI-MO/aimo-validation-aime`:
``` bash
vllm bench serve \
@ -245,7 +263,7 @@ vllm bench serve \
--seed 42
```
**`philschmid/mt-bench`**
`philschmid/mt-bench`:
``` bash
vllm bench serve \
@ -255,7 +273,7 @@ vllm bench serve \
--num-prompts 80
```
**Running With Sampling Parameters**
### Running With Sampling Parameters
When using OpenAI-compatible backends such as `vllm`, optional sampling
parameters can be specified. Example client command:
@ -273,25 +291,29 @@ vllm bench serve \
--num-prompts 10
```
**Running With Ramp-Up Request Rate**
### Running With Ramp-Up Request Rate
The benchmark tool also supports ramping up the request rate over the
duration of the benchmark run. This can be useful for stress testing the
server or finding the maximum throughput that it can handle, given some latency budget.
Two ramp-up strategies are supported:
- `linear`: Increases the request rate linearly from a start value to an end value.
- `exponential`: Increases the request rate exponentially.
The following arguments can be used to control the ramp-up:
- `--ramp-up-strategy`: The ramp-up strategy to use (`linear` or `exponential`).
- `--ramp-up-start-rps`: The request rate at the beginning of the benchmark.
- `--ramp-up-end-rps`: The request rate at the end of the benchmark.
</details>
## 📈 Example - Offline Throughput Benchmark
<details>
<summary><b>📈 Example - Offline Throughput Benchmark</b></summary>
<summary>Show more</summary>
<br/>
@ -305,15 +327,15 @@ vllm bench throughput \
If successful, you will see the following output
```
```text
Throughput: 7.15 requests/s, 4656.00 total tokens/s, 1072.15 output tokens/s
Total num prompt tokens: 5014
Total num output tokens: 1500
```
**VisionArena Benchmark for Vision Language Models**
### VisionArena Benchmark for Vision Language Models
``` bash
```bash
vllm bench throughput \
--model Qwen/Qwen2-VL-7B-Instruct \
--backend vllm-chat \
@ -325,13 +347,13 @@ vllm bench throughput \
The `num prompt tokens` now includes image token counts
```
```text
Throughput: 2.55 requests/s, 4036.92 total tokens/s, 326.90 output tokens/s
Total num prompt tokens: 14527
Total num output tokens: 1280
```
**InstructCoder Benchmark with Speculative Decoding**
### InstructCoder Benchmark with Speculative Decoding
``` bash
VLLM_WORKER_MULTIPROC_METHOD=spawn \
@ -349,15 +371,15 @@ vllm bench throughput \
"prompt_lookup_min": 2}'
```
```
```text
Throughput: 104.77 requests/s, 23836.22 total tokens/s, 10477.10 output tokens/s
Total num prompt tokens: 261136
Total num output tokens: 204800
```
**Other HuggingFaceDataset Examples**
### Other HuggingFaceDataset Examples
**`lmms-lab/LLaVA-OneVision-Data`**
`lmms-lab/LLaVA-OneVision-Data`:
```bash
vllm bench throughput \
@ -370,7 +392,7 @@ vllm bench throughput \
--num-prompts 10
```
**`Aeala/ShareGPT_Vicuna_unfiltered`**
`Aeala/ShareGPT_Vicuna_unfiltered`:
```bash
vllm bench throughput \
@ -382,7 +404,7 @@ vllm bench throughput \
--num-prompts 10
```
**`AI-MO/aimo-validation-aime`**
`AI-MO/aimo-validation-aime`:
```bash
vllm bench throughput \
@ -394,7 +416,7 @@ vllm bench throughput \
--num-prompts 10
```
**Benchmark with LoRA Adapters**
Benchmark with LoRA adapters:
``` bash
# download dataset
@ -413,20 +435,22 @@ vllm bench throughput \
</details>
## 🛠️ Example - Structured Output Benchmark
<details>
<summary><b>🛠️ Example - Structured Output Benchmark</b></summary>
<summary>Show more</summary>
<br/>
Benchmark the performance of structured output generation (JSON, grammar, regex).
**Server Setup**
### Server Setup
```bash
vllm serve NousResearch/Hermes-3-Llama-3.1-8B --disable-log-requests
vllm serve NousResearch/Hermes-3-Llama-3.1-8B
```
**JSON Schema Benchmark**
### JSON Schema Benchmark
```bash
python3 benchmarks/benchmark_serving_structured_output.py \
@ -438,7 +462,7 @@ python3 benchmarks/benchmark_serving_structured_output.py \
--num-prompts 1000
```
**Grammar-based Generation Benchmark**
### Grammar-based Generation Benchmark
```bash
python3 benchmarks/benchmark_serving_structured_output.py \
@ -450,7 +474,7 @@ python3 benchmarks/benchmark_serving_structured_output.py \
--num-prompts 1000
```
**Regex-based Generation Benchmark**
### Regex-based Generation Benchmark
```bash
python3 benchmarks/benchmark_serving_structured_output.py \
@ -461,7 +485,7 @@ python3 benchmarks/benchmark_serving_structured_output.py \
--num-prompts 1000
```
**Choice-based Generation Benchmark**
### Choice-based Generation Benchmark
```bash
python3 benchmarks/benchmark_serving_structured_output.py \
@ -472,7 +496,7 @@ python3 benchmarks/benchmark_serving_structured_output.py \
--num-prompts 1000
```
**XGrammar Benchmark Dataset**
### XGrammar Benchmark Dataset
```bash
python3 benchmarks/benchmark_serving_structured_output.py \
@ -485,14 +509,16 @@ python3 benchmarks/benchmark_serving_structured_output.py \
</details>
## 📚 Example - Long Document QA Benchmark
<details>
<summary><b>📚 Example - Long Document QA Benchmark</b></summary>
<summary>Show more</summary>
<br/>
Benchmark the performance of long document question-answering with prefix caching.
**Basic Long Document QA Test**
### Basic Long Document QA Test
```bash
python3 benchmarks/benchmark_long_document_qa_throughput.py \
@ -504,7 +530,7 @@ python3 benchmarks/benchmark_long_document_qa_throughput.py \
--repeat-count 5
```
**Different Repeat Modes**
### Different Repeat Modes
```bash
# Random mode (default) - shuffle prompts randomly
@ -537,14 +563,16 @@ python3 benchmarks/benchmark_long_document_qa_throughput.py \
</details>
## 🗂️ Example - Prefix Caching Benchmark
<details>
<summary><b>🗂️ Example - Prefix Caching Benchmark</b></summary>
<summary>Show more</summary>
<br/>
Benchmark the efficiency of automatic prefix caching.
**Fixed Prompt with Prefix Caching**
### Fixed Prompt with Prefix Caching
```bash
python3 benchmarks/benchmark_prefix_caching.py \
@ -555,7 +583,7 @@ python3 benchmarks/benchmark_prefix_caching.py \
--input-length-range 128:256
```
**ShareGPT Dataset with Prefix Caching**
### ShareGPT Dataset with Prefix Caching
```bash
# download dataset
@ -570,16 +598,32 @@ python3 benchmarks/benchmark_prefix_caching.py \
--input-length-range 128:256
```
### Prefix Repetition Dataset
```bash
vllm bench serve \
--backend openai \
--model meta-llama/Llama-2-7b-chat-hf \
--dataset-name prefix_repetition \
--num-prompts 100 \
--prefix-repetition-prefix-len 512 \
--prefix-repetition-suffix-len 128 \
--prefix-repetition-num-prefixes 5 \
--prefix-repetition-output-len 128
```
</details>
## ⚡ Example - Request Prioritization Benchmark
<details>
<summary><b>⚡ Example - Request Prioritization Benchmark</b></summary>
<summary>Show more</summary>
<br/>
Benchmark the performance of request prioritization in vLLM.
**Basic Prioritization Test**
### Basic Prioritization Test
```bash
python3 benchmarks/benchmark_prioritization.py \
@ -590,7 +634,7 @@ python3 benchmarks/benchmark_prioritization.py \
--scheduling-policy priority
```
**Multiple Sequences per Prompt**
### Multiple Sequences per Prompt
```bash
python3 benchmarks/benchmark_prioritization.py \
@ -603,3 +647,41 @@ python3 benchmarks/benchmark_prioritization.py \
```
</details>
## 👁️ Example - Multi-Modal Benchmark
<details>
<summary>Show more</summary>
<br/>
Benchmark the performance of multi-modal requests in vLLM.
### Images (ShareGPT4V)
Start vLLM:
```bash
python -m vllm.entrypoints.openai.api_server \
--model Qwen/Qwen2.5-VL-7B-Instruct \
--dtype bfloat16 \
--limit-mm-per-prompt '{"image": 1}' \
--allowed-local-media-path /path/to/sharegpt4v/images
```
Send requests with images:
```bash
python benchmarks/benchmark_serving.py \
--backend openai-chat \
--model Qwen/Qwen2.5-VL-7B-Instruct \
--dataset-name sharegpt \
--dataset-path /path/to/ShareGPT4V/sharegpt4v_instruct_gpt4-vision_cap100k.json \
--num-prompts 100 \
--save-result \
--result-dir ~/vllm_benchmark_results \
--save-detailed \
--endpoint /v1/chat/completion
```
</details>

View File

@ -3,6 +3,7 @@
This script automates the process of finding the optimal server parameter combination (`max-num-seqs` and `max-num-batched-tokens`) to maximize throughput for a vLLM server. It also supports additional constraints such as E2E latency and prefix cache hit rate.
## Table of Contents
- [Prerequisites](#prerequisites)
- [Configuration](#configuration)
- [How to Run](#how-to-run)
@ -52,7 +53,7 @@ You must set the following variables at the top of the script before execution.
1. **Configure**: Edit the script and set the variables in the [Configuration](#configuration) section.
2. **Execute**: Run the script. Since the process can take a long time, it is highly recommended to use a terminal multiplexer like `tmux` or `screen` to prevent the script from stopping if your connection is lost.
```
```bash
cd <FOLDER_OF_THIS_SCRIPT>
bash auto_tune.sh
```
@ -64,6 +65,7 @@ bash auto_tune.sh
Here are a few examples of how to configure the script for different goals:
### 1. Maximize Throughput (No Latency Constraint)
- **Goal**: Find the best `max-num-seqs` and `max-num-batched-tokens` to get the highest possible throughput for 1800 input tokens and 20 output tokens.
- **Configuration**:
@ -76,6 +78,7 @@ MAX_LATENCY_ALLOWED_MS=100000000000 # A very large number
```
#### 2. Maximize Throughput with a Latency Requirement
- **Goal**: Find the best server parameters when P99 end-to-end latency must be below 500ms.
- **Configuration**:
@ -88,6 +91,7 @@ MAX_LATENCY_ALLOWED_MS=500
```
#### 3. Maximize Throughput with Prefix Caching and Latency Requirements
- **Goal**: Find the best server parameters assuming a 60% prefix cache hit rate and a latency requirement of 500ms.
- **Configuration**:
@ -109,7 +113,7 @@ After the script finishes, you will find the results in a new, timestamped direc
- **Final Result Summary**: A file named `result.txt` is created in the log directory. It contains a summary of each tested combination and concludes with the overall best parameters found.
```
```text
# Example result.txt content
hash:a1b2c3d4...
max_num_seqs: 128, max_num_batched_tokens: 2048, request_rate: 10.0, e2el: 450.5, throughput: 9.8, goodput: 9.8

View File

@ -49,6 +49,7 @@ best_throughput=0
best_max_num_seqs=0
best_num_batched_tokens=0
best_goodput=0
best_request_rate=0
start_server() {
local gpu_memory_utilization=$1
@ -57,19 +58,35 @@ start_server() {
local vllm_log=$4
local profile_dir=$5
pkill -f vllm
pkill -if vllm
VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir vllm serve $MODEL \
--disable-log-requests \
--port 8004 \
--gpu-memory-utilization $gpu_memory_utilization \
--max-num-seqs $max_num_seqs \
--max-num-batched-tokens $max_num_batched_tokens \
--tensor-parallel-size $TP \
--enable-prefix-caching \
--load-format dummy \
--download-dir "$DOWNLOAD_DIR" \
--max-model-len $MAX_MODEL_LEN > "$vllm_log" 2>&1 &
# Define the common arguments as a bash array.
# Each argument and its value are separate elements.
local common_args_array=(
"$MODEL"
"--disable-log-requests"
"--port" "8004"
"--gpu-memory-utilization" "$gpu_memory_utilization"
"--max-num-seqs" "$max_num_seqs"
"--max-num-batched-tokens" "$max_num_batched_tokens"
"--tensor-parallel-size" "$TP"
"--enable-prefix-caching"
"--load-format" "dummy"
"--download-dir" "$DOWNLOAD_DIR"
"--max-model-len" "$MAX_MODEL_LEN"
)
# Use the array expansion "${common_args_array[@]}"
# This correctly passes each element as a separate argument.
if [[ -n "$profile_dir" ]]; then
# Start server with profiling enabled
VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 VLLM_TORCH_PROFILER_DIR=$profile_dir \
vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 &
else
# Start server without profiling
VLLM_USE_V1=1 VLLM_SERVER_DEV_MODE=1 \
vllm serve "${common_args_array[@]}" > "$vllm_log" 2>&1 &
fi
# wait for 10 minutes...
server_started=0
@ -83,6 +100,7 @@ start_server() {
sleep 10
fi
done
if (( ! server_started )); then
echo "server did not start within 10 minutes. Please check server log at $vllm_log".
return 1
@ -91,37 +109,20 @@ start_server() {
fi
}
update_best_profile() {
local profile_dir=$1
local profile_index=$2
sorted_paths=($(find "$profile_dir" -maxdepth 1 -not -path "$profile_dir" | sort))
selected_profile_file=
if [[ "$SYSTEM" == "TPU" ]]; then
selected_profile_file="${sorted_paths[$profile_index]}/*.xplane.pb"
fi
if [[ "$SYSTEM" == "GPU" ]]; then
selected_profile_file="${sorted_paths[$profile_index]}"
fi
rm -f $PROFILE_PATH/*
cp $selected_profile_file $PROFILE_PATH
}
run_benchmark() {
local max_num_seqs=$1
local max_num_batched_tokens=$2
local gpu_memory_utilization=$3
echo "max_num_seq: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens"
local vllm_log="$LOG_FOLDER/vllm_log_${max_num_seqs}_${max_num_batched_tokens}.txt"
local profile_dir="$LOG_FOLDER/profile_${max_num_seqs}_${max_num_batched_tokens}"
echo "vllm_log: $vllm_log"
echo
rm -f $vllm_log
mkdir -p $profile_dir
pkill -f vllm
local profile_index=0
pkill -if vllm
echo "starting server..."
start_server $gpu_memory_utilization $max_num_seqs $max_num_batched_tokens $vllm_log $profile_dir
# Call start_server without a profile_dir to avoid profiling overhead
start_server $gpu_memory_utilization $max_num_seqs $max_num_batched_tokens $vllm_log ""
result=$?
if [[ "$result" -eq 1 ]]; then
echo "server failed to start. gpu_memory_utilization:$gpu_memory_utilization, max_num_seqs:$max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens"
@ -135,7 +136,8 @@ run_benchmark() {
# get a basic qps by using request-rate inf
bm_log="$LOG_FOLDER/bm_log_${max_num_seqs}_${max_num_batched_tokens}_requestrate_inf.txt"
prefix_len=$(( INPUT_LEN * MIN_CACHE_HIT_PCT / 100 ))
adjusted_input_len=$(( INPUT_LEN - prefix_len ))
adjusted_input_len=$(( INPUT_LEN - prefix_len ))
# --profile flag is removed from this call
vllm bench serve \
--backend vllm \
--model $MODEL \
@ -149,8 +151,7 @@ adjusted_input_len=$(( INPUT_LEN - prefix_len ))
--goodput e2el:$MAX_LATENCY_ALLOWED_MS \
--num-prompts 1000 \
--random-prefix-len $prefix_len \
--port 8004 \
--profile &> "$bm_log"
--port 8004 &> "$bm_log"
throughput=$(grep "Request throughput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
e2el=$(grep "P99 E2EL (ms):" "$bm_log" | awk '{print $NF}')
goodput=$(grep "Request goodput (req/s):" "$bm_log" | sed 's/[^0-9.]//g')
@ -164,7 +165,6 @@ adjusted_input_len=$(( INPUT_LEN - prefix_len ))
# start from request-rate as int(throughput) + 1
request_rate=$((${throughput%.*} + 1))
while ((request_rate > 0)); do
profile_index=$((profile_index+1))
# clear prefix cache
curl -X POST http://0.0.0.0:8004/reset_prefix_cache
sleep 5
@ -202,12 +202,7 @@ adjusted_input_len=$(( INPUT_LEN - prefix_len ))
best_max_num_seqs=$max_num_seqs
best_num_batched_tokens=$max_num_batched_tokens
best_goodput=$goodput
if [[ "$SYSTEM" == "TPU" ]]; then
update_best_profile "$profile_dir/plugins/profile" $profile_index
fi
if [[ "$SYSTEM" == "GPU" ]]; then
update_best_profile "$profile_dir" $profile_index
fi
best_request_rate=$request_rate
fi
else
echo "max_num_seqs: $max_num_seqs, max_num_batched_tokens: $max_num_batched_tokens does not meet latency requirement ${MAX_LATENCY_ALLOWED_MS}"
@ -216,7 +211,7 @@ adjusted_input_len=$(( INPUT_LEN - prefix_len ))
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput"
pkill vllm
pkill -if vllm
sleep 10
printf '=%.0s' $(seq 1 20)
return 0
@ -229,7 +224,8 @@ read -r -a num_batched_tokens_list <<< "$NUM_BATCHED_TOKENS_LIST"
gpu_memory_utilization=0.98
find_gpu_memory_utilization=0
while (( $(echo "$gpu_memory_utilization >= 0.9" | bc -l) )); do
start_server $gpu_memory_utilization "${num_seqs_list[-1]}" "${num_batched_tokens_list[-1]}" "$LOG_FOLDER/vllm_log_gpu_memory_utilization_$gpu_memory_utilization.log"
# Pass empty string for profile_dir argument
start_server $gpu_memory_utilization "${num_seqs_list[-1]}" "${num_batched_tokens_list[-1]}" "$LOG_FOLDER/vllm_log_gpu_memory_utilization_$gpu_memory_utilization.log" ""
result=$?
if [[ "$result" -eq 0 ]]; then
find_gpu_memory_utilization=1
@ -252,5 +248,45 @@ for num_seqs in "${num_seqs_list[@]}"; do
done
done
echo "finish permutations"
# =================================================================================
# FINAL PROFILING RUN FOR THE BEST CONFIGURATION
# =================================================================================
if (( $(echo "$best_throughput > 0" | bc -l) )); then
echo
echo "Benchmark tuning finished. Now running profiling on the best configuration found..."
echo "Best config: max_num_seqs: $best_max_num_seqs, max_num_batched_tokens: $best_num_batched_tokens, throughput: $best_throughput"
echo
vllm_log="$LOG_FOLDER/vllm_log_BEST_PROFILE.txt"
bm_log="$LOG_FOLDER/bm_log_BEST_PROFILE.txt"
# Start server with the best params and profiling ENABLED
echo "Starting server for profiling..."
start_server $gpu_memory_utilization $best_max_num_seqs $best_num_batched_tokens "$vllm_log" "$PROFILE_PATH"
# Run benchmark with the best params and the --profile flag
echo "Running benchmark with profiling..."
prefix_len=$(( INPUT_LEN * MIN_CACHE_HIT_PCT / 100 ))
adjusted_input_len=$(( INPUT_LEN - prefix_len ))
vllm bench serve \
--backend vllm \
--model $MODEL \
--dataset-name random \
--random-input-len $adjusted_input_len \
--random-output-len $OUTPUT_LEN \
--ignore-eos \
--disable-tqdm \
--request-rate $best_request_rate \
--percentile-metrics ttft,tpot,itl,e2el \
--goodput e2el:$MAX_LATENCY_ALLOWED_MS \
--num-prompts 100 \
--random-prefix-len $prefix_len \
--port 8004 \
--profile &> "$bm_log"
else
echo "No configuration met the latency requirements. Skipping final profiling run."
fi
pkill -if vllm
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH"
echo "best_max_num_seqs: $best_max_num_seqs, best_num_batched_tokens: $best_num_batched_tokens, best_throughput: $best_throughput, profile saved in: $PROFILE_PATH" >> "$RESULT"

View File

@ -31,7 +31,7 @@ class RequestFuncInput:
model_name: Optional[str] = None
logprobs: Optional[int] = None
extra_body: Optional[dict] = None
multi_modal_content: Optional[dict] = None
multi_modal_content: Optional[dict | list[dict]] = None
ignore_eos: bool = False
language: Optional[str] = None
@ -364,7 +364,15 @@ async def async_request_openai_chat_completions(
) as session:
content = [{"type": "text", "text": request_func_input.prompt}]
if request_func_input.multi_modal_content:
content.append(request_func_input.multi_modal_content)
mm_content = request_func_input.multi_modal_content
if isinstance(mm_content, list):
content.extend(mm_content)
elif isinstance(mm_content, dict):
content.append(mm_content)
else:
raise TypeError(
"multi_modal_content must be a dict or list[dict] for openai-chat"
)
payload = {
"model": request_func_input.model_name
if request_func_input.model_name
@ -491,7 +499,10 @@ async def async_request_openai_audio(
buffer.seek(0)
return buffer
with to_bytes(*request_func_input.multi_modal_content["audio"]) as f:
mm_audio = request_func_input.multi_modal_content
if not isinstance(mm_audio, dict) or "audio" not in mm_audio:
raise TypeError("multi_modal_content must be a dict containing 'audio'")
with to_bytes(*mm_audio["audio"]) as f:
form = aiohttp.FormData()
form.add_field("file", f, content_type="audio/wav")
for key, value in payload.items():

View File

@ -0,0 +1,74 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import gc
from tabulate import tabulate
from benchmark_utils import TimeCollector
from vllm.utils import FlexibleArgumentParser
from vllm.v1.core.block_pool import BlockPool
def main(args):
rows = []
for allocate_block in args.allocate_blocks:
# Enforce a GC collect ahead to minimize the impact among runs
gc.collect()
block_pool = BlockPool(num_gpu_blocks=args.num_gpu_blocks, enable_caching=True)
get_blocks_times = TimeCollector(TimeCollector.US)
free_blocks_times = TimeCollector(TimeCollector.US)
for _ in range(args.num_iteration):
with get_blocks_times:
blocks = block_pool.get_new_blocks(allocate_block)
with free_blocks_times:
block_pool.free_blocks(blocks)
rows.append(
[get_blocks_times.cnt, args.num_gpu_blocks, allocate_block]
+ get_blocks_times.dump_avg_max()
+ free_blocks_times.dump_avg_max()
)
print(
tabulate(
rows,
headers=[
"Iterations",
"Total\nBlocks",
"Allocated\nBlocks",
"Get Blocks\nAvg (us)",
"Get Blocks\nMax (us)",
"Free Blocks\nAvg (us)",
"Free Blocks\nMax (us)",
],
tablefmt="grid",
floatfmt=".3f",
)
)
def invoke_main() -> None:
parser = FlexibleArgumentParser(
description="Benchmark the performance of BlockPool for KV Cache."
)
parser.add_argument("--num-gpu-blocks", type=int, default=100000)
parser.add_argument(
"--num-iteration",
type=int,
default=1000,
help="Number of iterations to run to stablize final data readings",
)
parser.add_argument(
"--allocate-blocks",
type=int,
nargs="*",
default=[10, 50, 100, 500, 1000],
help="Number of blocks to allocate",
)
args = parser.parse_args()
main(args)
if __name__ == "__main__":
invoke_main() # pragma: no cover

View File

@ -52,7 +52,7 @@ class SampleRequest:
prompt: Union[str, Any]
prompt_len: int
expected_output_len: int
multi_modal_data: Optional[Union[MultiModalDataDict, dict]] = None
multi_modal_data: Optional[Union[MultiModalDataDict, dict, list[dict]]] = None
lora_request: Optional[LoRARequest] = None
@ -430,14 +430,20 @@ class ShareGPTDataset(BenchmarkDataset):
skip_min_output_len_check=output_len is not None,
):
continue
# TODO: Also support ShareGPT4Video.
if image_path := entry.get("image"):
mm_content = process_image(image_path)
else:
mm_content = None
if enable_multimodal_chat:
prompt = self.apply_multimodal_chat_transformation(prompt, None)
prompt = self.apply_multimodal_chat_transformation(prompt, mm_content)
samples.append(
SampleRequest(
prompt=prompt,
prompt_len=prompt_len,
expected_output_len=new_output_len,
lora_request=lora_request,
multi_modal_data=mm_content,
)
)
self.maybe_oversample_requests(samples, num_requests)

View File

@ -0,0 +1,112 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import gc
import numpy as np
from tabulate import tabulate
from benchmark_utils import TimeCollector
from vllm.config import ModelConfig, SpeculativeConfig, VllmConfig
from vllm.utils import FlexibleArgumentParser
from vllm.v1.spec_decode.ngram_proposer import NgramProposer
def main(args):
rows = []
for max_ngram in args.max_ngram:
collector = TimeCollector(TimeCollector.US)
model_config = ModelConfig(
model="facebook/opt-125m",
task="generate",
max_model_len=args.num_token + args.num_spec_token,
tokenizer="facebook/opt-125m",
tokenizer_mode="auto",
dtype="auto",
seed=None,
trust_remote_code=False,
)
proposer = NgramProposer(
vllm_config=VllmConfig(
model_config=model_config,
speculative_config=SpeculativeConfig(
prompt_lookup_min=args.min_ngram,
prompt_lookup_max=max_ngram,
num_speculative_tokens=args.num_spec_token,
method="ngram",
),
)
)
# Warm up
proposer.propose(np.random.randint(0, 20, (args.num_token,)))
gc.collect()
for _ in range(args.num_iteration):
tokens = np.random.randint(0, 20, (args.num_req, args.num_token))
with collector:
for i in range(args.num_req):
proposer.propose(tokens[i, :])
rows.append(
[args.num_req, args.num_token, args.min_ngram, max_ngram]
+ collector.dump_avg_max()
)
print(
tabulate(
rows,
headers=[
"# Request",
"# Token",
"Min Ngram",
"Max Ngram",
"Avg (us)",
"Max (us)",
],
tablefmt="grid",
floatfmt=".3f",
)
)
def invoke_main() -> None:
parser = FlexibleArgumentParser(
description="Benchmark the performance of N-gram speculative decode drafting"
)
parser.add_argument(
"--num-iteration",
type=int,
default=100,
help="Number of iterations to run to stablize final data readings",
)
parser.add_argument(
"--num-req", type=int, default=128, help="Number of requests in the batch"
)
parser.add_argument(
"--num-token", type=int, default=1500, help="Number of tokens for each request"
)
parser.add_argument(
"--min-ngram",
type=int,
default=3,
help="Minimum n-gram to match",
)
parser.add_argument(
"--max-ngram",
type=int,
nargs="*",
default=[5, 7, 10, 15, 20],
help="Maximum n-gram to match",
)
parser.add_argument(
"--num-spec-token",
type=int,
default=3,
help="Number of speculative tokens to generate",
)
args = parser.parse_args()
main(args)
if __name__ == "__main__":
invoke_main() # pragma: no cover

View File

@ -5,8 +5,7 @@ r"""Benchmark online serving throughput.
On the server side, run one of the following commands:
vLLM OpenAI API server
vllm serve <your_model> \
--swap-space 16 \
--disable-log-requests
--swap-space 16
On the client side, run:
python benchmarks/benchmark_serving.py \
@ -264,7 +263,14 @@ async def benchmark(
input_requests[0].multi_modal_data,
)
assert test_mm_content is None or isinstance(test_mm_content, dict)
assert (
test_mm_content is None
or isinstance(test_mm_content, dict)
or (
isinstance(test_mm_content, list)
and all(isinstance(item, dict) for item in test_mm_content)
)
), "multi_modal_data must be a dict or list[dict]"
test_input = RequestFuncInput(
model=model_id,
model_name=model_name,
@ -396,20 +402,6 @@ async def benchmark(
tasks.append(asyncio.create_task(task))
outputs: list[RequestFuncOutput] = await asyncio.gather(*tasks)
if profile:
print("Stopping profiler...")
profile_input = RequestFuncInput(
model=model_id,
prompt=test_prompt,
api_url=base_url + "/stop_profile",
prompt_len=test_prompt_len,
output_len=test_output_len,
logprobs=logprobs,
)
profile_output = await request_func(request_func_input=profile_input)
if profile_output.success:
print("Profiler stopped")
if pbar is not None:
pbar.close()
@ -427,6 +419,10 @@ async def benchmark(
print("{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="="))
print("{:<40} {:<10}".format("Successful requests:", metrics.completed))
if max_concurrency is not None:
print("{:<40} {:<10}".format("Maximum request concurrency:", max_concurrency))
if request_rate != float("inf"):
print("{:<40} {:<10.2f}".format("Request rate configured (RPS):", request_rate))
print("{:<40} {:<10.2f}".format("Benchmark duration (s):", benchmark_duration))
print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input))
print("{:<40} {:<10}".format("Total generated tokens:", metrics.total_output))
@ -518,6 +514,20 @@ async def benchmark(
print("=" * 50)
if profile:
print("Stopping profiler...")
profile_input = RequestFuncInput(
model=model_id,
prompt=test_prompt,
api_url=base_url + "/stop_profile",
prompt_len=test_prompt_len,
output_len=test_output_len,
logprobs=logprobs,
)
profile_output = await request_func(request_func_input=profile_input)
if profile_output.success:
print("Profiler stopped")
return result

View File

@ -4,7 +4,7 @@ r"""Benchmark online serving throughput with structured outputs.
On the server side, run one of the following commands:
(vLLM OpenAI API server)
vllm serve <your_model> --disable-log-requests
vllm serve <your_model>
On the client side, run:
python benchmarks/benchmark_serving_structured_output.py \
@ -538,20 +538,6 @@ async def benchmark(
)
outputs: list[RequestFuncOutput] = await asyncio.gather(*tasks)
if profile:
print("Stopping profiler...")
profile_input = RequestFuncInput(
model=model_id,
prompt=test_request.prompt,
api_url=base_url + "/stop_profile",
prompt_len=test_request.prompt_len,
output_len=test_request.expected_output_len,
extra_body={test_request.structure_type: test_request.schema},
)
profile_output = await request_func(request_func_input=profile_input)
if profile_output.success:
print("Profiler stopped")
if pbar is not None:
pbar.close()
@ -569,6 +555,10 @@ async def benchmark(
print("{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="="))
print("{:<40} {:<10}".format("Successful requests:", metrics.completed))
if max_concurrency is not None:
print("{:<40} {:<10}".format("Maximum request concurrency:", max_concurrency))
if request_rate != float("inf"):
print("{:<40} {:<10.2f}".format("Request rate configured (RPS):", request_rate))
print("{:<40} {:<10.2f}".format("Benchmark duration (s):", benchmark_duration))
print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input))
print("{:<40} {:<10}".format("Total generated tokens:", metrics.total_output))
@ -666,6 +656,20 @@ async def benchmark(
print("=" * 50)
if profile:
print("Stopping profiler...")
profile_input = RequestFuncInput(
model=model_id,
prompt=test_request.prompt,
api_url=base_url + "/stop_profile",
prompt_len=test_request.prompt_len,
output_len=test_request.expected_output_len,
extra_body={test_request.structure_type: test_request.schema},
)
profile_output = await request_func(request_func_input=profile_input)
if profile_output.success:
print("Profiler stopped")
return result, ret

View File

@ -1,11 +1,12 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import json
import math
import os
from typing import Any
import time
from types import TracebackType
from typing import Any, Optional, Union
def convert_to_pytorch_benchmark_format(
@ -72,3 +73,53 @@ def write_to_json(filename: str, records: list) -> None:
cls=InfEncoder,
default=lambda o: f"<{type(o).__name__} object is not JSON serializable>",
)
# Collect time and generate time metrics
#
# Example Usage:
# collector = TimeCollector(TimeCollector.US)
# for _ in range(total_iteration):
# with collector:
# ...
# collector.dump_avg_max()
class TimeCollector:
NS: int = 1
US: int = NS * 1000
MS: int = US * 1000
S: int = MS * 1000
def __init__(self, scale: int) -> None:
self.cnt: int = 0
self._sum: int = 0
self._max: Optional[int] = None
self.scale = scale
self.start_time: int = time.monotonic_ns()
def collect(self, v: int) -> None:
self.cnt += 1
self._sum += v
if self._max is None:
self._max = v
else:
self._max = max(self._max, v)
def avg(self) -> Union[float, str]:
return self._sum * 1.0 / self.cnt / self.scale if self.cnt > 0 else "N/A"
def max(self) -> Union[float, str]:
return self._max / self.scale if self._max else "N/A"
def dump_avg_max(self) -> list[Union[float, str]]:
return [self.avg(), self.max()]
def __enter__(self) -> None:
self.start_time = time.monotonic_ns()
def __exit__(
self,
exc_type: Optional[type[BaseException]],
exc_value: Optional[BaseException],
exc_traceback: Optional[TracebackType],
) -> None:
self.collect(time.monotonic_ns() - self.start_time)

View File

@ -1,63 +1,199 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import asyncio
import logging
import os
import aiohttp
from quart import Quart, make_response, request
from quart import Quart, Response, make_response, request
from rate_limiter import RateLimiter
from request_queue import RequestQueue
AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)
app = Quart(__name__)
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
async def forward_request(url, data):
async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
def parse_args():
"""parse command line arguments"""
parser = argparse.ArgumentParser(description="vLLM P/D disaggregation proxy server")
# Add args
parser.add_argument(
"--timeout",
type=float,
default=300,
help="Timeout for backend service requests in seconds (default: 300)",
)
parser.add_argument(
"--max-concurrent",
type=int,
default=100,
help="Maximum concurrent requests to backend services (default: 100)",
)
parser.add_argument(
"--queue-size",
type=int,
default=500,
help="Maximum number of requests in the queue (default: 500)",
)
parser.add_argument(
"--rate-limit",
type=int,
default=40,
help="Maximum requests per second (default: 40)",
)
parser.add_argument(
"--port",
type=int,
default=8000,
help="Port to run the server on (default: 8000)",
)
parser.add_argument(
"--prefill-url",
type=str,
default="http://localhost:8100/v1/completions",
help="Prefill service endpoint URL",
)
parser.add_argument(
"--decode-url",
type=str,
default="http://localhost:8200/v1/completions",
help="Decode service endpoint URL",
)
return parser.parse_args()
def main():
"""parse command line arguments"""
args = parse_args()
# Initialize configuration using command line parameters
AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=args.timeout)
MAX_CONCURRENT_REQUESTS = args.max_concurrent
REQUEST_QUEUE_SIZE = args.queue_size
RATE_LIMIT = args.rate_limit
PREFILL_SERVICE_URL = args.prefill_url
DECODE_SERVICE_URL = args.decode_url
PORT = args.port
app = Quart(__name__)
# Initialize the rate limiter and request queue
rate_limiter = RateLimiter(RATE_LIMIT)
request_queue = RequestQueue(MAX_CONCURRENT_REQUESTS, REQUEST_QUEUE_SIZE)
# Attach the configuration object to the application instance
app.config.update(
{
"AIOHTTP_TIMEOUT": AIOHTTP_TIMEOUT,
"rate_limiter": rate_limiter,
"request_queue": request_queue,
"PREFILL_SERVICE_URL": PREFILL_SERVICE_URL,
"DECODE_SERVICE_URL": DECODE_SERVICE_URL,
}
)
# Start queue processing on app startup
@app.before_serving
async def startup():
"""Start request processing task when app starts serving"""
asyncio.create_task(request_queue.process())
async def forward_request(url, data):
"""Forward request to backend service with rate limiting and error handling"""
headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
async with session.post(url=url, json=data, headers=headers) as response:
if response.status == 200:
# if response.headers.get('Transfer-Encoding') == 'chunked':
if True:
async for chunk_bytes in response.content.iter_chunked(1024):
yield chunk_bytes
else:
content = await response.read()
yield content
@app.route("/v1/completions", methods=["POST"])
async def handle_request():
try:
original_request_data = await request.get_json()
prefill_request = original_request_data.copy()
# change max_tokens = 1 to let it only do prefill
prefill_request["max_tokens"] = 1
# finish prefill
async for _ in forward_request(
"http://localhost:8100/v1/completions", prefill_request
# Use rate limiter as context manager
async with (
rate_limiter,
aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session,
):
continue
try:
async with session.post(
url=url, json=data, headers=headers
) as response:
if response.status == 200:
# Stream response chunks
async for chunk_bytes in response.content.iter_chunked(1024):
yield chunk_bytes
else:
# Handle backend service errors
error_text = await response.text()
logger.error(
"Backend service error: %s - %s",
response.status,
error_text,
)
yield b'{"error": "Backend service error"}'
except aiohttp.ClientError as e:
# Handle connection errors
logger.error("Connection error to %s: %s", url, str(e))
yield b'{"error": "Service unavailable"}'
except asyncio.TimeoutError:
# Handle timeout errors
logger.error("Timeout connecting to %s", url)
yield b'{"error": "Service timeout"}'
# return decode
generator = forward_request(
"http://localhost:8200/v1/completions", original_request_data
)
response = await make_response(generator)
response.timeout = None
async def process_request():
"""Process a single request through prefill and decode stages"""
try:
original_request_data = await request.get_json()
return response
# Create prefill request (max_tokens=1)
prefill_request = original_request_data.copy()
prefill_request["max_tokens"] = 1
except Exception as e:
import sys
import traceback
# Execute prefill stage
async for _ in forward_request(PREFILL_SERVICE_URL, prefill_request):
continue
exc_info = sys.exc_info()
print("Error occurred in disagg prefill proxy server")
print(e)
print("".join(traceback.format_exception(*exc_info)))
# Execute decode stage and stream response
generator = forward_request(DECODE_SERVICE_URL, original_request_data)
response = await make_response(generator)
response.timeout = None # Disable timeout for streaming response
return response
except Exception:
logger.exception("Error processing request")
return Response(
response=b'{"error": "Internal server error"}',
status=500,
content_type="application/json",
)
@app.route("/v1/completions", methods=["POST"])
async def handle_request():
"""Handle incoming API requests with concurrency and rate limiting"""
# Create task for request processing
task = asyncio.create_task(process_request())
# Enqueue request or reject if queue is full
if not await request_queue.enqueue(task):
return Response(
response=b'{"error": "Server busy, try again later"}',
status=503,
content_type="application/json",
)
try:
# Return the response from the processing task
return await task
except asyncio.CancelledError:
# Handle task cancellation (timeout or queue full)
logger.warning("Request cancelled due to timeout or queue full")
return Response(
response=b'{"error": "Request cancelled"}',
status=503,
content_type="application/json",
)
# Start the Quart server with host can be set to 0.0.0.0
app.run(port=PORT)
if __name__ == "__main__":
app.run(port=8000)
main()

View File

@ -0,0 +1,45 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import asyncio
import time
class RateLimiter:
"""Token bucket rate limiter implementation"""
def __init__(self, rate_limit):
self.rate_limit = rate_limit # Requests per second
self.num_available_tokens = rate_limit # Available tokens
self.last_refill = time.monotonic() # Last token refill time
self.lock = asyncio.Lock() # Synchronization lock
async def acquire(self):
"""Acquire a token from the rate limiter"""
while True:
async with self.lock:
current_time = time.monotonic()
elapsed = current_time - self.last_refill
# Refill num_available_tokens if more than 1 second has passed
if elapsed > 1.0:
self.num_available_tokens = self.rate_limit
self.last_refill = current_time
# Check if num_available_tokens are available
if self.num_available_tokens > 0:
self.num_available_tokens -= 1
return True
# Calculate wait time if no num_available_tokens available
wait_time = 1.0 - elapsed
await asyncio.sleep(wait_time)
async def __aenter__(self):
"""Enter async context manager - acquire token"""
await self.acquire()
return self
async def __aexit__(self, exc_type, exc_value, traceback):
"""Exit async context manager - no cleanup needed"""
pass

View File

@ -0,0 +1,39 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import asyncio
from collections import deque
class RequestQueue:
"""Request queue manager with concurrency control"""
def __init__(self, max_concurrent, max_queue_size):
# Maximum concurrent requests
self.max_concurrent = max_concurrent
self.max_queue_size = max_queue_size # Maximum queue size
# Concurrency control
self.semaphore = asyncio.Semaphore(max_concurrent)
self.queue = deque() # Request queue
self.queue_size = 0 # Current queue size
self.lock = asyncio.Lock() # Sync queue Lock
async def enqueue(self, task):
"""Add a request task to the queue"""
async with self.lock:
if self.queue_size >= self.max_queue_size:
return False
self.queue.append(task)
self.queue_size += 1
return True
async def process(self):
"""Process queued requests using semaphore for concurrency control"""
while True:
if self.queue:
async with self.semaphore, self.lock:
task = self.queue.popleft()
self.queue_size -= 1
await task
await asyncio.sleep(0.01) # Yield control to event loop

View File

@ -1,345 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import os
import sys
from typing import Optional
import torch
import torch.nn.functional as F
from vllm import _custom_ops as ops
from vllm.model_executor.layers.quantization.aqlm import (
dequantize_weight,
generic_dequantize_gemm,
get_int_dtype,
optimized_dequantize_gemm,
)
from vllm.utils import FlexibleArgumentParser
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
def torch_mult(
# [..., in_features]
input: torch.Tensor,
weights: torch.Tensor,
# [num_out_groups, 1, 1, 1]
scales: torch.Tensor,
) -> torch.Tensor:
output = F.linear(input, weights)
return output
def dequant_out_scale(
# [..., in_features]
input: torch.Tensor,
# [num_out_groups, num_in_groups, num_codebooks]
codes: torch.IntTensor,
# [num_codebooks, codebook_size, out_group_size, in_group_size]
codebooks: torch.Tensor,
# [num_out_groups, 1, 1, 1]
scales: torch.Tensor,
output_partition_sizes: torch.IntTensor,
bias: Optional[torch.Tensor],
) -> torch.Tensor:
weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes)
if bias is None:
output = F.linear(input, weights, bias)
orig_shape = output.shape
flattened_output = output.view(-1, output.size(-1))
f_scales = scales.view(-1, scales.shape[0])
b_scales = f_scales.expand(flattened_output.shape[0], -1)
flattened_output *= b_scales
return flattened_output.view(orig_shape)
else:
b_scales = scales.view(scales.shape[:-3] + (-1,)).expand(-1, weights.shape[1])
weights *= b_scales
return F.linear(input, weights, bias)
def dequant_weight_scale(
# [..., in_features]
input: torch.Tensor,
# [num_out_groups, num_in_groups, num_codebooks]
codes: torch.IntTensor,
# [num_codebooks, codebook_size, out_group_size, in_group_size]
codebooks: torch.Tensor,
# [num_out_groups, 1, 1, 1]
scales: torch.Tensor,
output_partition_sizes: torch.IntTensor,
bias: Optional[torch.Tensor],
) -> torch.Tensor:
weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes)
b_scales = scales.view(scales.shape[:-3] + (-1,)).expand(-1, weights.shape[1])
weights *= b_scales
return F.linear(input, weights, bias)
def dequant_no_scale(
# [..., in_features]
input: torch.Tensor,
# [num_out_groups, num_in_groups, num_codebooks]
codes: torch.IntTensor,
# [num_codebooks, codebook_size, out_group_size, in_group_size]
codebooks: torch.Tensor,
# [num_out_groups, 1, 1, 1]
scales: torch.Tensor,
output_partition_sizes: torch.IntTensor,
bias: Optional[torch.Tensor],
) -> torch.Tensor:
weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes)
return F.linear(input, weights, bias)
# Compare the optimized 1x16 and 2x8 cuda decompression/dequant kernels against
# the generic pytorch version.
# Just visual comparison.
def dequant_test(k: int, parts: torch.Tensor, nbooks: int, bits: int) -> None:
n = int(parts.sum().item())
device = torch.device("cuda:0")
code_range = (1 << bits) // 2
ingroups = 8
codes = torch.randint(
-code_range,
code_range,
size=(n, k // ingroups, nbooks),
dtype=get_int_dtype(bits),
device=device,
)
codebooks = torch.randn(
size=(parts.shape[0] * nbooks, 1 << bits, 1, 8),
dtype=torch.float16,
device=device,
)
count = 0
for index in range(16):
for i in range(8):
for book in range(nbooks):
codebooks[book, index, 0, i] = count * (10**book)
count += 1
print("codes shape", codes.shape)
for i in range(16):
for book in range(nbooks):
codes[0, i, book] = i
codes[0, -i, book] = i
weights = dequantize_weight(codes, codebooks, None)
weights2 = ops.aqlm_dequant(codes, codebooks, parts)
print("weights shape:", weights.shape)
print("weights2 shape:", weights2.shape)
print("weights are:", weights)
print("weights2 are:", weights2)
print("first 128 weights are", weights[0, 0:128].to(torch.int32))
print("first 128 weights2 are:", weights2[0, 0:128].to(torch.int32))
print("last 128 weights are", weights[0, -128:])
print("last 128 weights2 are:", weights2[0, -128:])
def main():
parser = FlexibleArgumentParser(description="Benchmark aqlm performance.")
# Add arguments
parser.add_argument(
"--nbooks", type=int, default=1, help="Number of codebooks (default: 1)"
)
parser.add_argument(
"--bits",
type=int,
default=16,
help="Number of bits per code element (default: 16)",
)
parser.add_argument(
"--test",
type=bool,
default=False,
help="Run the decompression/dequant tester rather than benchmarking "
"(default: False)",
)
# Parse the arguments
args = parser.parse_args()
# Extract values
nbooks = args.nbooks
bits = args.bits
if args.test:
dequant_test(4096, torch.tensor((4096,)), nbooks, bits)
return
# Otherwise, benchmark.
methods = [
ops.aqlm_gemm,
dequant_out_scale,
generic_dequantize_gemm,
optimized_dequantize_gemm,
dequant_weight_scale,
torch_mult,
dequant_no_scale,
]
filename = f"./aqlm_benchmark_{nbooks}x{bits}.csv"
print(f"writing benchmarks to file {filename}")
with open(filename, "w") as f:
sys.stdout = f
print("m | k | n | n parts", end="")
for method in methods:
print(f" | {method.__name__.replace('_', ' ')} (µs)", end="")
print("")
# These are reasonable prefill sizes.
ksandpartions = (
(4096, (4096, 4096, 4096)),
(4096, (4096,)),
(4096, (11008, 11008)),
(11008, (4096,)),
)
# reasonable ranges for m.
for m in [
1,
2,
4,
8,
10,
12,
14,
16,
24,
32,
48,
52,
56,
64,
96,
112,
128,
256,
512,
1024,
1536,
2048,
3072,
4096,
]:
print(f"{m}", file=sys.__stdout__)
for ksp in ksandpartions:
run_grid(m, ksp[0], torch.tensor(ksp[1]), nbooks, bits, methods)
sys.stdout = sys.__stdout__
def run_grid(m: int, k: int, parts: torch.Tensor, nbooks: int, bits: int, methods):
# I didn't see visible improvements from increasing these, but feel free :)
num_warmup_trials = 1
num_trials = 1
num_calls = 100
# warmup.
for method in methods:
for _ in range(num_warmup_trials):
run_timing(
num_calls=num_calls,
m=m,
k=k,
parts=parts,
nbooks=nbooks,
bits=bits,
method=method,
)
n = parts.sum().item()
print(f"{m} | {k} | {n} | {parts.tolist()}", end="")
for method in methods:
best_time_us = 1e20
for _ in range(num_trials):
kernel_dur_ms = run_timing(
num_calls=num_calls,
m=m,
k=k,
parts=parts,
nbooks=nbooks,
bits=bits,
method=method,
)
kernel_dur_us = 1000 * kernel_dur_ms
if kernel_dur_us < best_time_us:
best_time_us = kernel_dur_us
print(f" | {kernel_dur_us:.0f}", end="")
print("")
def run_timing(
num_calls: int, m: int, k: int, parts: torch.Tensor, nbooks: int, bits: int, method
) -> float:
n = int(parts.sum().item())
device = torch.device("cuda:0")
input = torch.randn((1, m, k), dtype=torch.float16, device=device)
code_range = (1 << bits) // 2
ingroups = 8
codes = torch.randint(
-code_range,
code_range,
size=(n, k // ingroups, nbooks),
dtype=get_int_dtype(bits),
device=device,
)
codebooks = torch.randn(
size=(parts.shape[0] * nbooks, 1 << bits, 1, 8),
dtype=torch.float16,
device=device,
)
scales = torch.randn(size=(n, 1, 1, 1), dtype=torch.float16, device=device)
# for comparison to just a pytorch mult.
weights = torch.randn((n, k), dtype=torch.float16, device=device)
start_event = torch.cuda.Event(enable_timing=True)
end_event = torch.cuda.Event(enable_timing=True)
start_event.record()
if method is torch_mult:
for i in range(num_calls):
torch_mult(input, weights, scales)
else:
for i in range(num_calls):
method(input, codes, codebooks, scales, parts, None)
end_event.record()
end_event.synchronize()
dur_ms = start_event.elapsed_time(end_event) / num_calls
return dur_ms
if __name__ == "__main__":
sys.exit(main())

View File

@ -3,6 +3,8 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
from packaging import version
from vllm.model_executor.layers.quantization.utils.bitblas_utils import (
MINIMUM_BITBLAS_VERSION,
)
@ -10,7 +12,7 @@ from vllm.model_executor.layers.quantization.utils.bitblas_utils import (
try:
import bitblas
if bitblas.__version__ < MINIMUM_BITBLAS_VERSION:
if version.parse(bitblas.__version__) < version.parse(MINIMUM_BITBLAS_VERSION):
raise ImportError(
"bitblas version is wrong. Please "
f"install bitblas>={MINIMUM_BITBLAS_VERSION}"

View File

@ -236,6 +236,7 @@ def marlin_create_bench_fn(bt: BenchmarkTensors) -> Callable:
a=bt.a,
c=None,
b_q_weight=w_q,
b_bias=None,
b_scales=w_s,
global_scale=None,
b_zeros=w_zp,

View File

@ -3,6 +3,7 @@
import argparse
import json
import os
import time
from contextlib import nullcontext
from datetime import datetime
@ -22,6 +23,13 @@ from vllm.utils import FlexibleArgumentParser
FP8_DTYPE = current_platform.fp8_dtype()
def ensure_divisibility(numerator, denominator, text):
"""Ensure that numerator is divisible by the denominator."""
assert numerator % denominator == 0, "{} {} is not divisible by tp {}.".format(
text, numerator, denominator
)
class BenchmarkConfig(TypedDict):
BLOCK_SIZE_M: int
BLOCK_SIZE_N: int
@ -535,6 +543,7 @@ def save_configs(
use_fp8_w8a8: bool,
use_int8_w8a16: bool,
block_quant_shape: list[int],
save_dir: str,
) -> None:
dtype_str = get_config_dtype_str(
dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8
@ -545,7 +554,8 @@ def save_configs(
filename = get_config_file_name(
num_experts, shard_intermediate_size // 2, dtype_str, block_quant_shape
)
os.makedirs(save_dir, exist_ok=True)
filename = os.path.join(save_dir, filename)
print(f"Writing best config to {filename}...")
with open(filename, "w") as f:
json.dump(configs, f, indent=4)
@ -570,12 +580,10 @@ def main(args: argparse.Namespace):
E = config.ffn_config.moe_num_experts
topk = config.ffn_config.moe_top_k
intermediate_size = config.ffn_config.ffn_hidden_size
shard_intermediate_size = 2 * intermediate_size // args.tp_size
elif config.architectures[0] == "JambaForCausalLM":
E = config.num_experts
topk = config.num_experts_per_tok
intermediate_size = config.intermediate_size
shard_intermediate_size = 2 * intermediate_size // args.tp_size
elif config.architectures[0] in (
"DeepseekV3ForCausalLM",
"DeepseekV2ForCausalLM",
@ -584,17 +592,14 @@ def main(args: argparse.Namespace):
E = config.n_routed_experts
topk = config.num_experts_per_tok
intermediate_size = config.moe_intermediate_size
shard_intermediate_size = 2 * intermediate_size // args.tp_size
elif config.architectures[0] in ("Qwen2MoeForCausalLM", "Qwen3MoeForCausalLM"):
E = config.num_experts
topk = config.num_experts_per_tok
intermediate_size = config.moe_intermediate_size
shard_intermediate_size = 2 * intermediate_size // args.tp_size
elif config.architectures[0] in ("HunYuanMoEV1ForCausalLM"):
E = config.num_experts
topk = config.moe_topk[0]
intermediate_size = config.moe_intermediate_size[0]
shard_intermediate_size = 2 * intermediate_size // args.tp_size
else:
# Support for llama4
config = config.get_text_config()
@ -602,8 +607,14 @@ def main(args: argparse.Namespace):
E = config.num_local_experts
topk = config.num_experts_per_tok
intermediate_size = config.intermediate_size
enable_ep = bool(args.enable_expert_parallel)
if enable_ep:
ensure_divisibility(E, args.tp_size, "Number of experts")
E = E // args.tp_size
shard_intermediate_size = 2 * intermediate_size
else:
ensure_divisibility(intermediate_size, args.tp_size, "intermediate_size")
shard_intermediate_size = 2 * intermediate_size // args.tp_size
hidden_size = config.hidden_size
dtype = torch.float16 if current_platform.is_rocm() else config.torch_dtype
use_fp8_w8a8 = args.dtype == "fp8_w8a8"
@ -699,6 +710,7 @@ def main(args: argparse.Namespace):
use_fp8_w8a8,
use_int8_w8a16,
block_quant_shape,
args.save_dir,
)
end = time.time()
print(f"Tuning took {end - start:.2f} seconds")
@ -735,10 +747,14 @@ if __name__ == "__main__":
parser.add_argument(
"--tp-size", "-tp", "--tensor-parallel-size", type=int, default=2
)
parser.add_argument("--enable-expert-parallel", "-enable-ep", action="store_true")
parser.add_argument(
"--dtype", type=str, choices=["auto", "fp8_w8a8", "int8_w8a16"], default="auto"
)
parser.add_argument("--use-deep-gemm", action="store_true")
parser.add_argument(
"--save-dir", type=str, default="./", help="Directory to save tuned results"
)
parser.add_argument("--seed", type=int, default=0)
parser.add_argument("--batch-size", type=int, nargs="+", required=False)
parser.add_argument("--tune", action="store_true")

View File

@ -0,0 +1,328 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# This script benchmarks the mrope kernel (mainly for Qwen2VL and Qwen2.5VL models).
# It generates test data, runs benchmarks, and saves results to a CSV file.
#
# The CSV file (named with current date/time) contains these columns:
# model_name, tp_size, num_tokens, num_heads, num_kv_heads, head_dim, max_position,
# rope_theta, is_neox_style, rope_scaling, dtype, torch_mean, torch_median, torch_p99,
# torch_min, torch_max, triton_mean, triton_median, triton_p99, triton_min, triton_max,
# speedup
#
# == Usage Examples ==
#
# Single model benchmark:
# python3 benchmark_mrope.py --model-name Qwen/Qwen2-VL-7B-Instruct --tp-size 1 \
# --warmup-iter 10 --benchmark-iter 100 --dtype bfloat16 --seed 0 --num-tokens 1024
#
# All models benchmark:
# python3 benchmark_mrope.py --model-name "" --tp-size 1 --warmup-iter 10 \
# --benchmark-iter 100 --dtype bfloat16 --seed 0 --num-tokens 1024
#
# All models with different TP sizes:
# python3 benchmark_mrope.py --model-name "" --tp-size 1 2 4 8 --warmup-iter 10 \
# --benchmark-iter 100 --dtype bfloat16 --seed 0 --num-tokens 1024
#
# All models with different token counts:
# python3 benchmark_mrope.py --model-name "" --tp-size 1 --warmup-iter 10 \
# --benchmark-iter 100 --dtype bfloat16 --seed 0 --num-tokens 1024 4096 16384
import csv
import os
import time
from datetime import datetime
from typing import Any
import numpy as np
import torch
from vllm.model_executor.layers.rotary_embedding import get_rope
from vllm.platforms import current_platform
from vllm.transformers_utils.config import get_config
from vllm.utils import FlexibleArgumentParser
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
def generate_test_data(
num_tokens: int,
num_q_heads: int,
num_kv_heads: int,
head_size: int,
max_position_embeddings: int,
dtype: torch.dtype,
device: torch.device,
):
"""Generate test data for given configuration."""
# Create 2D positions (3, num_tokens) for multimodal case
positions = torch.randint(
0, max_position_embeddings // 4, (3, num_tokens), device=device
)
# Create query and key tensors
query = torch.randn(num_tokens, num_q_heads * head_size, dtype=dtype, device=device)
key = torch.randn(num_tokens, num_kv_heads * head_size, dtype=dtype, device=device)
return positions, query, key
def calculate_stats(times: list[float]) -> dict[str, float]:
"""Calculate statistics from a list of times."""
times_array = np.array(times)
return {
"mean": np.mean(times_array),
"median": np.median(times_array),
"p99": np.percentile(times_array, 99),
"min": np.min(times_array),
"max": np.max(times_array),
}
def benchmark_mrope(
model_name: str,
num_tokens: int,
head_dim: int,
tp_size: int,
num_heads: int,
num_kv_heads: int,
max_position: int = 8192,
rope_theta: float = 10000,
is_neox_style: bool = True,
rope_scaling: dict[str, Any] = None,
dtype: torch.dtype = torch.bfloat16,
seed: int = 0,
warmup_iter: int = 10,
benchmark_iter: int = 100,
csv_writer=None,
):
current_platform.seed_everything(seed)
torch.set_default_device(device)
# the parameters to compute the q k v size based on tp_size
mrope_helper_class = get_rope(
head_size=head_dim,
rotary_dim=head_dim,
max_position=max_position,
base=rope_theta,
is_neox_style=is_neox_style,
rope_scaling=rope_scaling,
dtype=dtype,
).to(device=device)
print(80 * "=")
print(
f"Evaluating model: {model_name} "
f"with tp_size: {tp_size} "
f"and num_tokens: {num_tokens}, "
f"dtype: {dtype}"
)
# create q k v input tensors
# create rotary pos emb input tensors
positions, query, key = generate_test_data(
num_tokens, num_heads, num_kv_heads, head_dim, max_position, dtype, device
)
# Warm up
for _ in range(warmup_iter):
mrope_helper_class.forward_native(
positions,
query.clone(),
key.clone(),
)
mrope_helper_class.forward_cuda(
positions,
query.clone(),
key.clone(),
)
torch.cuda.synchronize()
# Time reference implementation
torch_times = []
for _ in range(benchmark_iter):
query_clone = query.clone()
key_clone = key.clone()
torch.cuda.synchronize()
start_time = time.time()
mrope_helper_class.forward_native(
positions,
query_clone,
key_clone,
)
torch.cuda.synchronize()
torch_times.append(time.time() - start_time)
# Time triton kernel implementation
triton_times = []
for _ in range(benchmark_iter):
query_clone = query.clone()
key_clone = key.clone()
torch.cuda.synchronize()
start_time = time.time()
mrope_helper_class.forward_cuda(
positions,
query_clone,
key_clone,
)
torch.cuda.synchronize()
triton_times.append(time.time() - start_time)
# Calculate statistics
torch_stats = calculate_stats(torch_times)
triton_stats = calculate_stats(triton_times)
print(f"\nPerformance for config ({num_tokens}, {num_heads}, {num_kv_heads}):")
print(
f"Torch implementation: "
f"mean={torch_stats['mean']:.8f}s, "
f"median={torch_stats['median']:.8f}s, "
f"p99={torch_stats['p99']:.8f}s"
)
print(
f"Triton implementation: "
f"mean={triton_stats['mean']:.8f}s, "
f"median={triton_stats['median']:.8f}s, "
f"p99={triton_stats['p99']:.8f}s"
)
print(
f"Triton Speedup over Torch: {torch_stats['mean'] / triton_stats['mean']:.8f}x"
)
# Write to CSV
if csv_writer:
row = [
model_name,
tp_size,
num_tokens,
num_heads,
num_kv_heads,
head_dim,
max_position,
rope_theta,
is_neox_style,
str(rope_scaling),
str(dtype).split(".")[-1],
torch_stats["mean"],
torch_stats["median"],
torch_stats["p99"],
torch_stats["min"],
torch_stats["max"],
triton_stats["mean"],
triton_stats["median"],
triton_stats["p99"],
triton_stats["min"],
triton_stats["max"],
torch_stats["mean"] / triton_stats["mean"], # speedup
]
csv_writer.writerow(row)
return torch_stats, triton_stats
if __name__ == "__main__":
parser = FlexibleArgumentParser(
description="Benchmark the rotary embedding kernels."
)
parser.add_argument("--model-name", type=str, default="")
parser.add_argument("--tp-size", type=int, default=1)
parser.add_argument("--warmup-iter", type=int, default=10)
parser.add_argument("--benchmark-iter", type=int, default=100)
parser.add_argument("--dtype", type=str, choices=["bfloat16"], default="bfloat16")
parser.add_argument("--seed", type=int, default=0)
parser.add_argument("--num-tokens", type=int, nargs="+", required=False)
parser.add_argument("--trust-remote-code", action="store_true")
parser.add_argument("--output-csv", type=str, default="mrope_benchmark_results.csv")
args = parser.parse_args()
print(args)
# Create CSV file for results
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
csv_filename = f"{os.path.splitext(args.output_csv)[0]}_{timestamp}.csv"
with open(csv_filename, "w", newline="") as csvfile:
csv_writer = csv.writer(csvfile)
# Write header
header = [
"model_name",
"tp_size",
"num_tokens",
"num_heads",
"num_kv_heads",
"head_dim",
"max_position",
"rope_theta",
"is_neox_style",
"rope_scaling",
"dtype",
"torch_mean",
"torch_median",
"torch_p99",
"torch_min",
"torch_max",
"triton_mean",
"triton_median",
"triton_p99",
"triton_min",
"triton_max",
"speedup",
]
csv_writer.writerow(header)
model_tp_dict = {}
if args.model_name == "":
model_tp_dict = {
"Qwen/Qwen2-VL-2B-Instruct": [1],
"Qwen/Qwen2-VL-7B-Instruct": [1],
"Qwen/Qwen2-VL-72B-Instruct": [2, 4, 8],
"Qwen/Qwen2.5-VL-3B-Instruct": [1, 2, 4, 8],
"Qwen/Qwen2.5-VL-7B-Instruct": [1, 2, 4, 8],
"Qwen/Qwen2.5-VL-72B-Instruct": [2, 4, 8],
}
else:
model_tp_dict[args.model_name] = [args.tp_size]
if args.num_tokens is None:
num_tokens_list = [2**i for i in range(0, 18)]
else:
num_tokens_list = args.num_tokens
for model_name, tp_list in model_tp_dict.items():
config = get_config(model_name, trust_remote_code=args.trust_remote_code)
for tp_size in tp_list:
# get the model config
total_num_kv_heads = config.num_key_value_heads
total_num_heads = config.num_attention_heads
num_heads = total_num_heads // tp_size
num_kv_heads = max(1, total_num_kv_heads // tp_size)
head_dim = config.hidden_size // total_num_heads
q_size = num_heads * head_dim
kv_size = num_kv_heads * head_dim
is_neox_style = True
rope_theta = config.rope_theta
max_position = config.max_position_embeddings
for num_tokens in num_tokens_list:
benchmark_mrope(
model_name=model_name,
num_tokens=num_tokens,
head_dim=head_dim,
tp_size=tp_size,
num_heads=num_heads,
num_kv_heads=num_kv_heads,
max_position=max_position,
rope_theta=rope_theta,
is_neox_style=is_neox_style,
rope_scaling=config.rope_scaling,
dtype=getattr(torch, args.dtype),
seed=args.seed,
warmup_iter=args.warmup_iter,
benchmark_iter=args.benchmark_iter,
csv_writer=csv_writer,
)
print(f"Benchmark results saved to {csv_filename}")

View File

@ -0,0 +1,159 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import math
from contextlib import contextmanager
from typing import Callable
from unittest.mock import patch
import torch
from vllm.model_executor.layers.quantization.utils import fp8_utils, int8_utils
from vllm.platforms import current_platform
@contextmanager
def _triton_mode():
"""Temporarily force the Triton fallback path"""
with patch("vllm.platforms.current_platform.is_cuda", return_value=False):
yield
def _time_cuda(
fn: Callable[[], tuple[torch.Tensor, torch.Tensor]],
warmup_iters: int,
bench_iters: int,
) -> float:
# warmup
for _ in range(warmup_iters):
fn()
torch.cuda.synchronize()
start = torch.cuda.Event(enable_timing=True)
end = torch.cuda.Event(enable_timing=True)
start.record()
for _ in range(bench_iters):
fn()
end.record()
torch.cuda.synchronize()
return start.elapsed_time(end) / bench_iters # ms/iter
def _run_single(
shape: tuple[int, int],
group_size: int,
dtype: str,
*,
column_major: bool = False,
scale_ue8m0: bool = False,
warmup_iters: int,
bench_iters: int,
) -> None:
num_tokens, hidden_dim = shape
device = torch.device("cuda")
torch.manual_seed(42)
x = torch.randn(num_tokens, hidden_dim, device=device, dtype=torch.bfloat16) * 8
if dtype == "fp8":
def cuda_impl():
return fp8_utils.per_token_group_quant_fp8(
x,
group_size,
column_major_scales=column_major,
use_ue8m0=scale_ue8m0,
)
def triton_impl():
with _triton_mode():
return fp8_utils.per_token_group_quant_fp8(
x,
group_size,
column_major_scales=column_major,
use_ue8m0=scale_ue8m0,
)
elif dtype == "int8":
def cuda_impl():
return int8_utils.per_token_group_quant_int8(x, group_size)
def triton_impl():
with _triton_mode():
return int8_utils.per_token_group_quant_int8(x, group_size)
else:
raise ValueError("dtype must be 'fp8' or 'int8'")
cuda_ms = _time_cuda(cuda_impl, warmup_iters, bench_iters)
triton_ms = _time_cuda(triton_impl, warmup_iters, bench_iters)
speedup = triton_ms / cuda_ms if cuda_ms else math.inf
cfg_desc = (
f"shape={shape} gs={group_size:<3} col_major={column_major:<5} "
f"ue8m0={scale_ue8m0:<5} dtype={dtype}"
)
print(
f"{cfg_desc:55} | CUDA {cuda_ms:7.3f} ms | Triton {triton_ms:7.3f} ms | "
f"speed-up ×{speedup:5.2f}"
)
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument("--warmup-iters", type=int, default=10)
parser.add_argument("--bench-iters", type=int, default=100)
parser.add_argument("--dtype", choices=["fp8", "int8", "both"], default="both")
return parser.parse_args()
if __name__ == "__main__":
if not current_platform.is_cuda():
raise RuntimeError("CUDA device is required to run this benchmark.")
args = parse_args()
warmup_iters, bench_iters = args.warmup_iters, args.bench_iters
shapes = [(32, 128), (64, 256), (16, 512)]
group_sizes = [64, 128]
dtypes = ["fp8", "int8"] if args.dtype == "both" else [args.dtype]
header = (
"Configuration".ljust(55)
+ " | "
+ "CUDA (ms)".center(12)
+ " | "
+ "Triton (ms)".center(13)
+ " | "
+ "Speed-up"
)
print(header)
print("-" * len(header))
for dtype in dtypes:
for shape in shapes:
for gs in group_sizes:
if dtype == "fp8":
for col_major in (False, True):
for ue8m0 in (False, True):
_run_single(
shape,
gs,
dtype,
column_major=col_major,
scale_ue8m0=ue8m0,
warmup_iters=warmup_iters,
bench_iters=bench_iters,
)
else: # INT8 has no col-major / ue8m0 switches
_run_single(
shape,
gs,
dtype,
warmup_iters=warmup_iters,
bench_iters=bench_iters,
)

View File

@ -0,0 +1,156 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from __future__ import annotations
import random
import time
import torch
from tabulate import tabulate
from vllm import _custom_ops as ops
from vllm.logger import init_logger
from vllm.platforms import current_platform
from vllm.utils import (
STR_DTYPE_TO_TORCH_DTYPE,
FlexibleArgumentParser,
create_kv_caches_with_random_flash,
)
logger = init_logger(__name__)
@torch.inference_mode()
def run_benchmark(
num_tokens: int,
num_heads: int,
head_size: int,
block_size: int,
num_blocks: int,
dtype: torch.dtype,
kv_cache_dtype: str,
kv_cache_layout: str,
num_iters: int,
device: str = "cuda",
) -> float:
"""Return latency (seconds) for given num_tokens."""
if kv_cache_dtype == "fp8" and head_size % 16:
raise ValueError("fp8 kv-cache requires head_size to be a multiple of 16.")
current_platform.seed_everything(42)
torch.set_default_device(device)
# create random key / value tensors [T, H, D].
key = torch.randn(num_tokens, num_heads, head_size, dtype=dtype, device=device)
value = torch.randn_like(key)
# prepare the slot mapping.
# each token is assigned a unique slot in the KV-cache.
num_slots = block_size * num_blocks
if num_tokens > num_slots:
raise ValueError("num_tokens cannot exceed the total number of cache slots")
slot_mapping_lst = random.sample(range(num_slots), num_tokens)
slot_mapping = torch.tensor(slot_mapping_lst, dtype=torch.long, device=device)
key_caches, value_caches = create_kv_caches_with_random_flash(
num_blocks,
block_size,
1, # num_layers
num_heads,
head_size,
kv_cache_dtype,
dtype,
device=device,
cache_layout=kv_cache_layout,
)
key_cache, value_cache = key_caches[0], value_caches[0]
# compute per-kernel scaling factors for fp8 conversion (if used).
k_scale = (key.amax() / 64.0).to(torch.float32)
v_scale = (value.amax() / 64.0).to(torch.float32)
def run_cuda_benchmark(n_iters: int) -> float:
nonlocal key, value, key_cache, value_cache, slot_mapping
torch.cuda.synchronize()
start = time.perf_counter()
for _ in range(n_iters):
ops.reshape_and_cache_flash(
key,
value,
key_cache,
value_cache,
slot_mapping,
kv_cache_dtype,
k_scale,
v_scale,
)
torch.cuda.synchronize()
end = time.perf_counter()
return (end - start) / n_iters
# warm-up
run_cuda_benchmark(3)
lat = run_cuda_benchmark(num_iters)
# free tensors to mitigate OOM when sweeping
del key, value, key_cache, value_cache, slot_mapping
torch.cuda.empty_cache()
return lat
def main(args):
rows = []
for layout in ["NHD", "HND"]:
for exp in range(1, 17):
n_tok = 2**exp
lat = run_benchmark(
num_tokens=n_tok,
num_heads=args.num_heads,
head_size=args.head_size,
block_size=args.block_size,
num_blocks=args.num_blocks,
dtype=STR_DTYPE_TO_TORCH_DTYPE[args.dtype],
kv_cache_dtype=args.kv_cache_dtype,
kv_cache_layout=layout,
num_iters=args.iters,
device="cuda",
)
rows.append([n_tok, layout, f"{lat * 1e6:.3f}"])
print(tabulate(rows, headers=["num_tokens", "layout", "latency (µs)"]))
if __name__ == "__main__":
parser = FlexibleArgumentParser()
parser.add_argument("--num-heads", type=int, default=128)
parser.add_argument(
"--head-size",
type=int,
choices=[64, 80, 96, 112, 120, 128, 192, 256],
default=128,
)
parser.add_argument("--block-size", type=int, choices=[16, 32], default=16)
parser.add_argument("--num-blocks", type=int, default=128 * 512)
parser.add_argument(
"--dtype",
type=str,
choices=["half", "bfloat16", "float"],
default="bfloat16",
)
parser.add_argument(
"--kv-cache-dtype",
type=str,
choices=["auto", "fp8"],
default="auto",
)
parser.add_argument("--iters", type=int, default=100)
args = parser.parse_args()
main(args)

View File

@ -41,7 +41,6 @@ def benchmark_decode(
device = "cuda"
torch.manual_seed(0)
# Currently only HEAD_GRP_SIZE == 8 is supported
HEAD_GRP_SIZE = 8
MAX_SEQ_LEN = max_seq_len
@ -71,22 +70,20 @@ def benchmark_decode(
if kv_cache_dtype.startswith("fp8"):
kv_cache, _ = to_float8(kv_cache)
output_trtllm = torch.empty(q.shape, dtype=dtype)
# Benchmark TRT decode
def trt_decode():
return flashinfer.decode.trtllm_batch_decode_with_kv_cache(
q,
kv_cache,
workspace_buffer,
num_qo_heads,
num_kv_heads,
sm_scale,
block_tables,
kv_lens_tensor,
page_size,
max_kv_len,
kv_cache_dtype,
k_scale,
v_scale,
bmm1_scale=k_scale * sm_scale,
bmm2_scale=v_scale,
out=output_trtllm,
)
def time_fn(fn, warmup=10, trials=20):
@ -125,6 +122,8 @@ def benchmark_decode(
kv_indices = torch.tensor(kv_indices, dtype=torch.int32)
kv_last_page_lens = torch.tensor(kv_last_page_lens, dtype=torch.int32)
output_baseline = torch.empty(q.shape, dtype=dtype)
wrapper = flashinfer.BatchDecodeWithPagedKVCacheWrapper(
workspace_buffer,
kv_layout,
@ -145,7 +144,7 @@ def benchmark_decode(
)
def baseline_decode():
return wrapper.run(q, kv_cache, sm_scale, k_scale, v_scale)
return wrapper.run(q, kv_cache, sm_scale, k_scale, v_scale, output_baseline)
baseline_mean, baseline_std = time_fn(baseline_decode)
@ -214,25 +213,39 @@ if __name__ == "__main__":
max_seq_lens = [1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072]
all_results = []
print("Running benchmark for kv_cache_dtype: bfloat16")
print(
"\tnum_seqs\tmax_seq_len\ttrt_mean\ttrt_std\tbaseline_mean\tbaseline_std\tspeedup_percent"
"Running benchmark for q_dtype = bfloat16, kv_cache_dtype: bfloat16, "
"output_dtype: bfloat16"
)
print(
"\tnum_seqs\tmax_seq_len\ttrt_mean\ttrt_std\tbaseline_mean\t"
"baseline_std\tspeedup_percent"
)
for max_seq_len in max_seq_lens:
for bs in num_seqs:
result = benchmark_decode(
bs, max_seq_len, dtype=torch.bfloat16, kv_cache_dtype="auto"
bs,
max_seq_len,
dtype=torch.bfloat16,
kv_cache_dtype="auto",
)
all_results.append(result)
print("Running benchmark for q_dtype = bfloat16, kv_cache_dtype: fp8")
print(
"\tnum_seqs\tmax_seq_len\ttrt_mean\ttrt_std\tbaseline_mean\tbaseline_std\tspeedup_percent"
"Running benchmark for q_dtype = bfloat16, kv_cache_dtype: fp8, "
"output_dtype: bfloat16"
)
print(
"\tnum_seqs\tmax_seq_len\ttrt_mean\ttrt_std\tbaseline_mean\t"
"baseline_std\tspeedup_percent"
)
for max_seq_len in max_seq_lens:
for bs in num_seqs:
result = benchmark_decode(
bs, max_seq_len, dtype=torch.bfloat16, kv_cache_dtype="fp8"
bs,
max_seq_len,
dtype=torch.bfloat16,
kv_cache_dtype="fp8",
)
all_results.append(result)

View File

@ -0,0 +1,250 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import csv
import os
import random
from datetime import datetime
import flashinfer
import torch
FLOAT32_BYTES = torch.finfo(torch.float).bits // 8
# KV Cache Layout for TRT-LLM
# kv_cache_shape = (num_blocks, 2, num_kv_heads, page_size, head_dim)
def to_float8(x, dtype=torch.float8_e4m3fn):
finfo = torch.finfo(dtype)
min_val, max_val = x.aminmax()
amax = torch.maximum(min_val.abs(), max_val.abs()).clamp(min=1e-12)
scale = finfo.max / amax * 0.1
x_scl_sat = (x * scale).clamp(min=finfo.min, max=finfo.max)
return x_scl_sat.to(dtype), scale.float().reciprocal()
@torch.no_grad()
def benchmark_prefill(
num_seqs,
max_seq_len,
page_size=16,
dtype=torch.bfloat16,
kv_layout="HND",
num_kv_heads=8,
kv_cache_dtype="auto",
head_dim=128,
warmup=10,
trials=20,
):
torch.set_default_device("cuda")
torch.manual_seed(0)
HEAD_GRP_SIZE = 8
MAX_SEQ_LEN = max_seq_len
# large number to reduce kv_cache reuse
NUM_BLOCKS = int(256000 / page_size)
workspace_buffer = torch.empty(1024 * 1024 * 1024, dtype=torch.int8)
num_qo_heads = num_kv_heads * HEAD_GRP_SIZE
sm_scale = float(1.0 / (head_dim**0.5))
q_lens = [random.randint(1, MAX_SEQ_LEN) for _ in range(num_seqs)]
q_lens[-1] = MAX_SEQ_LEN
max_q_len = max(q_lens)
q_indptr = torch.cat(
[
torch.tensor([0], dtype=torch.int32),
torch.cumsum(
torch.tensor(q_lens, dtype=torch.int32), dim=0, dtype=torch.int32
),
]
)
q = torch.randn(sum(q_lens), num_qo_heads, head_dim, dtype=dtype)
kv_lens = [random.randint(0, MAX_SEQ_LEN) for _ in range(num_seqs)]
kv_lens[-1] = MAX_SEQ_LEN
seq_lens = [q_len + kv_len for q_len, kv_len in zip(q_lens, kv_lens)]
max_seq_len = max(seq_lens)
seq_lens_tensor = torch.tensor(seq_lens, dtype=torch.int32)
max_num_blocks_per_seq = (max_seq_len + page_size - 1) // page_size
block_tables = torch.randint(
0, NUM_BLOCKS, (num_seqs, max_num_blocks_per_seq), dtype=torch.int32
)
kv_cache_shape = (NUM_BLOCKS, 2, num_kv_heads, page_size, head_dim)
kv_cache = torch.randn(size=kv_cache_shape, dtype=dtype)
k_scale = v_scale = 1.0
if kv_cache_dtype.startswith("fp8"):
kv_cache, _ = to_float8(kv_cache)
output_trtllm = torch.empty(q.shape, dtype=dtype)
kv_indptr = [0]
kv_indices = []
kv_last_page_lens = []
for i in range(num_seqs):
seq_len = seq_lens[i]
assert seq_len > 0
num_blocks = (seq_len + page_size - 1) // page_size
kv_indices.extend(block_tables[i, :num_blocks])
kv_indptr.append(kv_indptr[-1] + num_blocks)
kv_last_page_len = seq_len % page_size
if kv_last_page_len == 0:
kv_last_page_len = page_size
kv_last_page_lens.append(kv_last_page_len)
kv_indptr = torch.tensor(kv_indptr, dtype=torch.int32)
kv_indices = torch.tensor(kv_indices, dtype=torch.int32)
kv_last_page_lens = torch.tensor(kv_last_page_lens, dtype=torch.int32)
output_baseline = torch.empty(q.shape, dtype=dtype)
wrapper = flashinfer.BatchPrefillWithPagedKVCacheWrapper(
workspace_buffer, kv_layout
)
wrapper.plan(
q_indptr,
kv_indptr,
kv_indices,
kv_last_page_lens,
num_qo_heads,
num_kv_heads,
head_dim,
page_size,
causal=True,
sm_scale=sm_scale,
q_data_type=dtype,
kv_data_type=kv_cache.dtype,
)
def time_fn(fn, warmup=10, trials=20):
torch.cuda.synchronize()
start = torch.cuda.Event(enable_timing=True)
end = torch.cuda.Event(enable_timing=True)
times = []
for i in range(warmup):
fn()
for i in range(trials):
start.record()
fn()
end.record()
torch.cuda.synchronize()
times.append(start.elapsed_time(end)) # ms
return sum(times) / len(times), torch.std(torch.tensor(times))
def baseline_prefill():
return wrapper.run(
q, kv_cache, k_scale=k_scale, v_scale=v_scale, out=output_baseline
)
def trt_prefill():
return flashinfer.prefill.trtllm_batch_context_with_kv_cache(
query=q,
kv_cache=kv_cache,
workspace_buffer=workspace_buffer,
block_tables=block_tables,
seq_lens=seq_lens_tensor,
max_q_len=max_q_len,
max_kv_len=max_seq_len,
bmm1_scale=k_scale * sm_scale,
bmm2_scale=v_scale,
batch_size=num_seqs,
cum_seq_lens_q=q_indptr,
cum_seq_lens_kv=kv_indptr,
out=output_trtllm,
)
trt_mean, trt_std = time_fn(trt_prefill)
baseline_mean, baseline_std = time_fn(baseline_prefill)
# Calculate percentage speedup (positive means TRT is faster)
speedup_percent = (baseline_mean - trt_mean) / baseline_mean
print(
f"\t{num_seqs}\t{max_seq_len}\t{trt_mean:.5f}\t{trt_std.item():.5f}"
f"\t{baseline_mean:.5f}\t{baseline_std.item():.5f}\t{speedup_percent:.5f}"
)
# Return results for CSV writing
return {
"num_seqs": num_seqs,
"trt_mean": trt_mean,
"trt_std": trt_std.item(),
"baseline_mean": baseline_mean,
"baseline_std": baseline_std.item(),
"speedup_percent": speedup_percent,
"q_dtype": str(dtype),
"kv_cache_dtype": kv_cache_dtype,
"page_size": page_size,
"num_kv_heads": num_kv_heads,
"head_dim": head_dim,
"max_seq_len": max_seq_len,
}
def write_results_to_csv(results, filename=None):
"""Write benchmark results to CSV file."""
if filename is None:
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
filename = f"flashinfer_trtllm_benchmark_{timestamp}.csv"
fieldnames = [
"num_seqs",
"trt_mean",
"trt_std",
"baseline_mean",
"baseline_std",
"speedup_percent",
"q_dtype",
"kv_cache_dtype",
"page_size",
"num_kv_heads",
"head_dim",
"max_seq_len",
]
file_exists = os.path.exists(filename)
with open(filename, "a", newline="") as csvfile:
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
if not file_exists:
writer.writeheader()
for result in results:
writer.writerow(result)
print(f"Results written to {filename}")
if __name__ == "__main__":
num_seqs = [1, 4, 8, 16, 32, 64, 128, 256]
max_seq_lens = [1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072]
all_results = []
print(
"Running benchmark for q_dtype = bfloat16, kv_cache_dtype: bfloat16, "
"output_dtype: bfloat16"
)
print(
"\tnum_seqs\tmax_seq_len\ttrt_mean\ttrt_std\tbaseline_mean\t"
"baseline_std\tspeedup_percent"
)
for max_seq_len in max_seq_lens:
for bs in num_seqs:
result = benchmark_prefill(
bs,
max_seq_len,
dtype=torch.bfloat16,
kv_cache_dtype="auto",
)
all_results.append(result)
# Write all results to CSV
write_results_to_csv(all_results)

View File

@ -8,7 +8,7 @@ Currently this just includes dense GEMMs and only works on Hopper GPUs.
You need to install vLLM in your usual fashion, then install DeepGEMM from source in its own directory:
```
```bash
git clone --recursive https://github.com/deepseek-ai/DeepGEMM
cd DeepGEMM
python setup.py install
@ -17,7 +17,7 @@ uv pip install -e .
## Usage
```
```console
python benchmark_fp8_block_dense_gemm.py
INFO 02-26 21:55:13 [__init__.py:207] Automatically detected platform cuda.
===== STARTING FP8 GEMM BENCHMARK =====

View File

@ -4,49 +4,16 @@
# ruff: noqa: E501
import time
# Import DeepGEMM functions
import deep_gemm
import torch
from deep_gemm import calc_diff, ceil_div, get_col_major_tma_aligned_tensor
# Import vLLM functions
from vllm import _custom_ops as ops
from vllm.model_executor.layers.quantization.utils.fp8_utils import (
get_col_major_tma_aligned_tensor,
per_token_group_quant_fp8,
w8a8_block_fp8_matmul,
)
from vllm.triton_utils import triton
# Copied from
# https://github.com/deepseek-ai/DeepGEMM/blob/78cacf70d41d15d688bd493ebc85845f7f2a3d5d/tests/test_core.py#L9
def per_token_cast_to_fp8(
x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
"""Convert tensor to FP8 format with per-token scaling."""
assert x.dim() == 2 and x.size(1) % 128 == 0
m, n = x.shape
x_view = x.view(m, -1, 128)
x_amax = x_view.abs().float().amax(dim=2).view(m, -1).clamp(1e-4)
return (x_view * (448.0 / x_amax.unsqueeze(2))).to(
torch.float8_e4m3fn).view(m, n), (x_amax / 448.0).view(m, -1)
# Copied from
# https://github.com/deepseek-ai/DeepGEMM/blob/78cacf70d41d15d688bd493ebc85845f7f2a3d5d/tests/test_core.py#L17
def per_block_cast_to_fp8(
x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
"""Convert tensor to FP8 format with per-block scaling."""
assert x.dim() == 2
m, n = x.shape
x_padded = torch.zeros((ceil_div(m, 128) * 128, ceil_div(n, 128) * 128),
dtype=x.dtype,
device=x.device)
x_padded[:m, :n] = x
x_view = x_padded.view(-1, 128, x_padded.size(1) // 128, 128)
x_amax = x_view.abs().float().amax(dim=(1, 3), keepdim=True).clamp(1e-4)
x_scaled = (x_view * (448.0 / x_amax)).to(torch.float8_e4m3fn)
return x_scaled.view_as(x_padded)[:m, :n].contiguous(), (
x_amax / 448.0).view(x_view.size(0), x_view.size(2))
from vllm.utils.deep_gemm import calc_diff, fp8_gemm_nt, per_block_cast_to_fp8
def benchmark_shape(m: int,
@ -69,14 +36,14 @@ def benchmark_shape(m: int,
# Pre-quantize B for all implementations
# (weights can be pre-quantized offline)
B_deepgemm, B_scale_deepgemm = per_block_cast_to_fp8(B)
B_vllm, B_scale_vllm = per_block_cast_to_fp8(B)
B_deepgemm, B_scale_deepgemm = per_block_cast_to_fp8(B, [128, 128], use_ue8m0=True)
B_vllm, B_scale_vllm = per_block_cast_to_fp8(B, [128, 128], use_ue8m0=True)
# Block size configuration
block_size = [128, 128]
# Pre-quantize A for all implementations
A_deepgemm, A_scale_deepgemm = per_token_cast_to_fp8(A)
A_deepgemm, A_scale_deepgemm = per_token_group_quant_fp8(A, block_size[1])
A_scale_deepgemm = get_col_major_tma_aligned_tensor(A_scale_deepgemm)
C_deepgemm = torch.empty((m, n), device='cuda', dtype=torch.bfloat16)
A_vllm, A_scale_vllm = per_token_group_quant_fp8(A, block_size[1])
@ -85,7 +52,7 @@ def benchmark_shape(m: int,
# === DeepGEMM Implementation ===
def deepgemm_gemm():
deep_gemm.gemm_fp8_fp8_bf16_nt((A_deepgemm, A_scale_deepgemm),
fp8_gemm_nt((A_deepgemm, A_scale_deepgemm),
(B_deepgemm, B_scale_deepgemm),
C_deepgemm)
return C_deepgemm

View File

@ -1,108 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import gc
import time
from typing import Optional
from tabulate import tabulate
from vllm.utils import FlexibleArgumentParser
from vllm.v1.core.block_pool import BlockPool
class Metric:
def __init__(self) -> None:
self.cnt: int = 0
self.sum_v: int = 0
self.max_v: Optional[int] = None
def update(self, v: int) -> None:
self.cnt += 1
self.sum_v += v
if self.max_v is None:
self.max_v = v
else:
self.max_v = max(self.max_v, v)
def avg_v(self) -> float:
return self.sum_v * 1.0 / self.cnt
def main(args):
rows = []
for allocate_block in args.allocate_blocks:
# Enforce a GC collect ahead to minimize the impact among runs
gc.collect()
block_pool = BlockPool(num_gpu_blocks=args.num_gpu_blocks, enable_caching=True)
get_blocks_metric: Metric = Metric()
free_blocks_metric: Metric = Metric()
for _ in range(args.num_iteration):
t1 = time.monotonic_ns()
blocks = block_pool.get_new_blocks(allocate_block)
t2 = time.monotonic_ns()
block_pool.free_blocks(blocks)
t3 = time.monotonic_ns()
get_blocks_metric.update(t2 - t1)
free_blocks_metric.update(t3 - t2)
if get_blocks_metric.max_v is not None and free_blocks_metric.max_v is not None:
rows.append(
[
get_blocks_metric.cnt,
args.num_gpu_blocks,
allocate_block,
get_blocks_metric.avg_v() / 1000000,
get_blocks_metric.max_v / 1000000.0,
free_blocks_metric.avg_v() / 1000000,
free_blocks_metric.max_v / 1000000.0,
]
)
else:
print(
"No valid metrics found."
f" {get_blocks_metric.max_v=} {free_blocks_metric.max_v=}"
)
print(
tabulate(
rows,
headers=[
"Iterations",
"Total\nBlocks",
"Allocated\nBlocks",
"Get Blocks\nAvg (ms)",
"Get Blocks\nMax (ms)",
"Free Blocks\nAvg (ms)",
"Free Blocks\nMax (ms)",
],
tablefmt="grid",
floatfmt=".6f",
)
)
def invoke_main() -> None:
parser = FlexibleArgumentParser(
description="Benchmark the performance of BlockPool for KV Cache."
)
parser.add_argument("--num-gpu-blocks", type=int, default=100000)
parser.add_argument(
"--num-iteration",
type=int,
default=1000,
help="Number of iterations to run to stablize final data readings",
)
parser.add_argument(
"--allocate-blocks",
type=int,
nargs="*",
default=[10, 50, 100, 500, 1000],
help="Number of blocks to allocate",
)
args = parser.parse_args()
main(args)
if __name__ == "__main__":
invoke_main() # pragma: no cover

View File

@ -0,0 +1,71 @@
# Benchmark KV Cache Offloading with Multi-Turn Conversations
The requirements (pip) for `benchmark_serving_multi_turn.py` can be found in `requirements.txt`
First start serving your model
```bash
export MODEL_NAME=/models/meta-llama/Meta-Llama-3.1-8B-Instruct/
vllm serve $MODEL_NAME --disable-log-requests
```
## Synthetic Multi-Turn Conversations
Download the following text file (used for generation of synthetic conversations)
```bash
wget https://www.gutenberg.org/ebooks/1184.txt.utf-8
mv 1184.txt.utf-8 pg1184.txt
```
The filename `pg1184.txt` is used in `generate_multi_turn.json` (see `"text_files"`).
But you may use other text files if you prefer (using this specific file is not required).
Then run the benchmarking script
```bash
export MODEL_NAME=/models/meta-llama/Meta-Llama-3.1-8B-Instruct/
python benchmark_serving_multi_turn.py --model $MODEL_NAME --input-file generate_multi_turn.json \
--num-clients 2 --max-active-conversations 6
```
You can edit the file `generate_multi_turn.json` to change the conversation parameters (number of turns, etc.).
If successful, you will see the following output
```bash
----------------------------------------------------------------------------------------------------
Statistics summary:
runtime_sec = 215.810
requests_per_sec = 0.769
----------------------------------------------------------------------------------------------------
count mean std min 25% 50% 75% 90% 99% max
ttft_ms 166.0 78.22 67.63 45.91 59.94 62.26 64.43 69.66 353.18 567.54
tpot_ms 166.0 25.37 0.57 24.40 25.07 25.31 25.50 25.84 27.50 28.05
latency_ms 166.0 2591.07 326.90 1998.53 2341.62 2573.01 2860.10 3003.50 3268.46 3862.94
input_num_turns 166.0 7.43 4.57 1.00 3.00 7.00 11.00 13.00 17.00 17.00
input_num_tokens 166.0 2006.20 893.56 522.00 1247.75 2019.00 2718.00 3233.00 3736.45 3899.00
output_num_tokens 166.0 100.01 11.80 80.00 91.00 99.00 109.75 116.00 120.00 120.00
output_num_chunks 166.0 99.01 11.80 79.00 90.00 98.00 108.75 115.00 119.00 119.00
----------------------------------------------------------------------------------------------------
```
## ShareGPT Conversations
To run with the ShareGPT data, download the following ShareGPT dataset:
`https://huggingface.co/datasets/philschmid/sharegpt-raw/blob/main/sharegpt_20230401_clean_lang_split.json`
Use the `convert_sharegpt_to_openai.py` script to convert the dataset to a format supported by `benchmark_serving_multi_turn.py`
```bash
python convert_sharegpt_to_openai.py sharegpt_20230401_clean_lang_split.json sharegpt_conv_128.json --seed=99 --max-items=128
```
The script will convert the ShareGPT dataset to a dataset with the standard user/assistant roles.
The flag `--max-items=128` is used to sample 128 conversations from the original dataset (change as needed).
Use the output JSON file `sharegpt_conv_128.json` as the `--input-file` for `benchmark_serving_multi_turn.py`.

View File

@ -0,0 +1,493 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from abc import ABC, abstractmethod
from statistics import mean
from typing import Any, NamedTuple, Optional, Union
import numpy as np # type: ignore
import pandas as pd # type: ignore
from bench_utils import (
TEXT_SEPARATOR,
Color,
logger,
)
from transformers import AutoTokenizer # type: ignore
# Conversation ID is a string (e.g: "UzTK34D")
ConvId = str
# A list of dicts (dicts with keys "id" and "messages")
ShareGptConversations = list[dict[str, Any]]
# A list of dicts (dicts with keys "role" and "content")
MessagesList = list[dict[str, str]]
# Map conversation ID to conversation messages
ConversationsMap = list[ConvId, MessagesList]
class Distribution(ABC):
@abstractmethod
def sample(self, size: int = 1) -> np.ndarray:
pass
class UniformDistribution(Distribution):
def __init__(
self,
min_val: Union[int, float],
max_val: Union[int, float],
is_integer: bool = True,
) -> None:
self.min_val = min_val
self.max_val = max_val
self.is_integer = is_integer
def sample(self, size: int = 1) -> np.ndarray:
if self.is_integer:
return np.random.randint(
int(self.min_val), int(self.max_val + 1), size=size
)
else:
return np.random.uniform(self.min_val, self.max_val, size=size)
def __repr__(self) -> str:
return f"UniformDistribution[{self.min_val}, {self.max_val}]"
class ConstantDistribution(Distribution):
def __init__(self, value: Union[int, float]) -> None:
self.value = value
self.max_val = value
def sample(self, size: int = 1) -> np.ndarray:
return np.full(shape=size, fill_value=self.value)
def __repr__(self) -> str:
return f"Constant[{self.value}]"
class ZipfDistribution(Distribution):
def __init__(self, alpha: float, max_val: Optional[int] = None) -> None:
self.alpha = alpha
self.max_val = max_val
def sample(self, size: int = 1) -> np.ndarray:
samples = np.random.zipf(self.alpha, size=size)
if self.max_val:
samples = np.minimum(samples, self.max_val)
return samples
def __repr__(self) -> str:
return f"ZipfDistribution[{self.alpha}]"
class PoissonDistribution(Distribution):
def __init__(self, alpha: float, max_val: Optional[int] = None) -> None:
self.alpha = alpha
self.max_val = max_val
def sample(self, size: int = 1) -> np.ndarray:
samples = np.random.poisson(self.alpha, size=size)
if self.max_val:
samples = np.minimum(samples, self.max_val)
return samples
def __repr__(self) -> str:
return f"PoissonDistribution[{self.alpha}]"
class LognormalDistribution(Distribution):
def __init__(
self, mean: float, sigma: float, max_val: Optional[int] = None
) -> None:
self.mean = mean
self.sigma = sigma
self.max_val = max_val
def sample(self, size: int = 1) -> np.ndarray:
samples = np.random.lognormal(mean=self.mean, sigma=self.sigma, size=size)
if self.max_val:
samples = np.minimum(samples, self.max_val)
return np.round(samples).astype(int)
def __repr__(self) -> str:
return f"LognormalDistribution[{self.mean}, {self.sigma}]"
class GenConvArgs(NamedTuple):
num_conversations: int
text_files: list[str]
input_num_turns: Distribution
input_common_prefix_num_tokens: Distribution
input_prefix_num_tokens: Distribution
input_num_tokens: Distribution
output_num_tokens: Distribution
print_stats: bool
def verify_field_exists(
conf: dict, field_name: str, section: str, subsection: str
) -> None:
if field_name not in conf:
raise ValueError(
f"Missing field '{field_name}' in {section=} and {subsection=}"
)
def get_random_distribution(
conf: dict, section: str, subsection: str, optional: bool = False
) -> Distribution:
# section can be "prompt_input" or "prompt_output" (both required)
conf = conf[section]
if optional and subsection not in conf:
# Optional subsection, if not found assume the value is always 0
return ConstantDistribution(0)
# subsection can be "num_turns", "num_tokens" or "prefix_num_tokens"
if subsection not in conf:
raise ValueError(f"Missing subsection {subsection} in section {section}")
conf = conf[subsection]
distribution = conf.get("distribution")
if distribution is None:
raise ValueError(
f"Missing field 'distribution' in {section=} and {subsection=}"
)
if distribution == "constant":
verify_field_exists(conf, "value", section, subsection)
return ConstantDistribution(conf["value"])
elif distribution == "zipf":
verify_field_exists(conf, "alpha", section, subsection)
max_val = conf.get("max", None)
return ZipfDistribution(conf["alpha"], max_val=max_val)
elif distribution == "poisson":
verify_field_exists(conf, "alpha", section, subsection)
max_val = conf.get("max", None)
return PoissonDistribution(conf["alpha"], max_val=max_val)
elif distribution == "lognormal":
verify_field_exists(conf, "mean", section, subsection)
verify_field_exists(conf, "sigma", section, subsection)
max_val = conf.get("max", None)
return LognormalDistribution(conf["mean"], conf["sigma"], max_val=max_val)
elif distribution == "uniform":
verify_field_exists(conf, "min", section, subsection)
verify_field_exists(conf, "max", section, subsection)
min_value = conf["min"]
max_value = conf["max"]
assert min_value > 0
assert min_value <= max_value
is_integer = isinstance(min_value, int) and isinstance(max_value, int)
return UniformDistribution(min_value, max_value, is_integer)
else:
raise ValueError(f"Unknown distribution: {distribution}")
def parse_input_json_file(conf: dict) -> GenConvArgs:
# Validate the input file
assert isinstance(conf, dict)
required_fields = [
"filetype",
"num_conversations",
"text_files",
"prompt_input",
"prompt_output",
]
for field in required_fields:
assert field in conf, f"Missing field {field} in input {conf}"
assert conf["filetype"] == "generate_conversations"
assert conf["num_conversations"] > 0, "num_conversations should be larger than zero"
text_files = conf["text_files"]
assert isinstance(text_files, list), "Field 'text_files' should be a list"
assert len(text_files) > 0, (
"Field 'text_files' should be a list with at least one file"
)
# Parse the parameters for the prompt input/output workload
input_num_turns = get_random_distribution(conf, "prompt_input", "num_turns")
input_num_tokens = get_random_distribution(conf, "prompt_input", "num_tokens")
input_common_prefix_num_tokens = get_random_distribution(
conf, "prompt_input", "common_prefix_num_tokens", optional=True
)
input_prefix_num_tokens = get_random_distribution(
conf, "prompt_input", "prefix_num_tokens"
)
output_num_tokens = get_random_distribution(conf, "prompt_output", "num_tokens")
print_stats: bool = conf.get("print_stats", False)
assert isinstance(print_stats, bool), (
"Field 'print_stats' should be either 'true' or 'false'"
)
args = GenConvArgs(
num_conversations=conf["num_conversations"],
text_files=text_files,
input_num_turns=input_num_turns,
input_common_prefix_num_tokens=input_common_prefix_num_tokens,
input_prefix_num_tokens=input_prefix_num_tokens,
input_num_tokens=input_num_tokens,
output_num_tokens=output_num_tokens,
print_stats=print_stats,
)
return args
def print_conv_stats(conversations: ConversationsMap, tokenizer: AutoTokenizer) -> None:
# Collect statistics
conv_stats: list[dict[Any, Any]] = []
req_stats: list[int] = []
print("\nCollecting statistics...")
for messages in conversations.values():
# messages is a list of dicts
user_tokens: list[int] = []
assistant_tokens: list[int] = []
request_tokens: list[int] = []
req_tokens = 0
for m in messages:
content = m["content"]
num_tokens = len(tokenizer(content).input_ids)
if m["role"] == "user":
user_tokens.append(num_tokens)
# New user prompt including all chat history
req_tokens += num_tokens
request_tokens.append(req_tokens)
elif m["role"] == "assistant":
assistant_tokens.append(num_tokens)
# Update assistant answer
# (will be part of chat history for the next user prompt)
req_tokens += num_tokens
item_stats = {
"conversation_turns": len(messages),
"user_tokens": mean(user_tokens),
"assistant_tokens": mean(assistant_tokens),
}
conv_stats.append(item_stats)
req_stats.extend(request_tokens)
# Print statistics
percentiles = [0.25, 0.5, 0.75, 0.9, 0.99]
print(TEXT_SEPARATOR)
print(f"{Color.YELLOW}Conversations statistics:{Color.RESET}")
print(TEXT_SEPARATOR)
df = pd.DataFrame(conv_stats)
print(df.describe(percentiles=percentiles).transpose())
print(TEXT_SEPARATOR)
print(f"{Color.YELLOW}Request statistics:{Color.RESET}")
print(TEXT_SEPARATOR)
df = pd.DataFrame(req_stats, columns=["request_tokens"])
print(df.describe(percentiles=percentiles).transpose())
print(TEXT_SEPARATOR)
def generate_conversations(
args: GenConvArgs, tokenizer: AutoTokenizer
) -> ConversationsMap:
# Text for all user prompts
# (text from the input text files will be appended to this line)
base_prompt_text = "Please rewrite the following text and add more content: "
base_prompt_token_count = len(
tokenizer.encode(base_prompt_text, add_special_tokens=False)
)
logger.info(f"{Color.PURPLE}Generating conversations...{Color.RESET}")
logger.info(args)
list_of_tokens = []
for filename in args.text_files:
# Load text file that will be used to generate prompts
with open(filename) as file:
data = file.read()
tokens_in_file = tokenizer.encode(data, add_special_tokens=False)
list_of_tokens.extend(tokens_in_file)
conversations: ConversationsMap = {}
conv_id = 0
# Generate number of turns for every conversation
turn_count: np.ndarray = args.input_num_turns.sample(args.num_conversations)
# Turn count should be at least 2 (one user prompt and one assistant answer)
turn_count = np.maximum(turn_count, 2)
# Round up to an even number (every user prompt should have an answer)
turn_count = turn_count + (turn_count % 2)
# Generate number of prefix tokens for every conversation
conv_prefix_tokens: np.ndarray = args.input_prefix_num_tokens.sample(
args.num_conversations
)
# Used to reduce shared text between conversations
# (jump/skip over text sections between conversations)
base_offset = 0
# Common prefix size for all conversations (only 1 sample required)
common_prefix_text = ""
common_prefix_tokens: int = args.input_common_prefix_num_tokens.sample(1)[0]
if common_prefix_tokens > 0:
# Using "." at the end to separate sentences
common_prefix_text = (
tokenizer.decode(list_of_tokens[: common_prefix_tokens - 2]) + "."
)
base_offset += common_prefix_tokens
for conv_id in range(args.num_conversations):
# Generate a single conversation
messages: MessagesList = []
nturns = turn_count[conv_id]
# User prompt token count per turn (with lower limit)
input_token_count: np.ndarray = args.input_num_tokens.sample(nturns)
input_token_count = np.maximum(input_token_count, base_prompt_token_count)
# Assistant answer token count per turn (with lower limit)
output_token_count: np.ndarray = args.output_num_tokens.sample(nturns)
output_token_count = np.maximum(output_token_count, 1)
user_turn = True
for turn_id in range(nturns):
if user_turn:
role = "user"
num_tokens = input_token_count[turn_id]
# Generate the user prompt,
# use a unique prefix (the conv_id) for each conversation
# (to avoid shared prefix between conversations)
content = f"{conv_id} is a nice number... "
if len(common_prefix_text) > 0 and turn_id == 0:
content = common_prefix_text + content
# Update the number of tokens left for the content
num_tokens -= len(tokenizer.encode(content, add_special_tokens=False))
if turn_id == 0:
prefix_num_tokens = conv_prefix_tokens[conv_id]
if prefix_num_tokens > 0:
# Add prefix text (context) to the first turn
start_offset = base_offset
end_offset = start_offset + prefix_num_tokens
assert len(list_of_tokens) > end_offset, (
"Not enough input text to generate "
f"{prefix_num_tokens} tokens for the "
f"prefix text ({start_offset=}, {end_offset=})"
)
content += f"{conv_id}, " + tokenizer.decode(
list_of_tokens[start_offset:end_offset]
)
base_offset += prefix_num_tokens
# Add the actual user prompt/question after the prefix text
content += base_prompt_text
num_tokens -= base_prompt_token_count
if num_tokens > 0:
# Add text from the input file (to reach the desired token count)
start_offset = base_offset + turn_id * input_token_count.max()
end_offset = start_offset + num_tokens
assert len(list_of_tokens) > end_offset, (
f"Not enough input text to generate {num_tokens} tokens "
f"for the prompt ({start_offset=}, {end_offset=})"
)
# Convert tokens back to text
content += tokenizer.decode(list_of_tokens[start_offset:end_offset])
else:
role = "assistant"
# This content will not be used as input to the LLM server
# (actual answers will be used instead).
# Content is only required to determine the min_tokens/max_tokens
# (inputs to the LLM server).
num_tokens = output_token_count[turn_id]
assert len(list_of_tokens) > num_tokens, (
f"Not enough input text to generate {num_tokens} "
"tokens for assistant content"
)
content = tokenizer.decode(list_of_tokens[:num_tokens])
# Append the user/assistant message to the list of messages
messages.append({"role": role, "content": content})
user_turn = not user_turn
# Add the new conversation
conversations[f"CONV_ID_{conv_id}"] = messages
# Increase base offset for the next conversation
base_offset += nturns
if args.print_stats:
print_conv_stats(conversations, tokenizer)
return conversations
def conversations_list_to_dict(input_list: ShareGptConversations) -> ConversationsMap:
conversations: ConversationsMap = {}
for item in input_list:
conv_id: str = item["id"]
assert isinstance(conv_id, str)
assert conv_id not in conversations, (
f"Conversation ID {conv_id} found more than once in the input"
)
messages: MessagesList = item["messages"]
assert isinstance(messages, list), (
f"Conversation messages should be a list (ID: {conv_id})"
)
assert len(messages) > 0, f"Conversation with no messages (ID: {conv_id})"
conversations[conv_id] = messages
logger.info(f"Using {len(conversations)} unique conversations (IDs)")
assert len(conversations) == len(input_list)
# Print statistics about the selected conversations
stats: list[dict[str, Any]] = []
for conv_data in conversations.values():
stats.append({"num_turns": len(conv_data)})
print(TEXT_SEPARATOR)
print(f"{Color.YELLOW}Conversations statistics:{Color.RESET}")
print(TEXT_SEPARATOR)
percentiles = [0.25, 0.5, 0.75, 0.9, 0.99, 0.999, 0.9999]
conv_stats = pd.DataFrame(stats).describe(percentiles=percentiles)
print(conv_stats.transpose())
print(TEXT_SEPARATOR)
return conversations
def conversations_dict_to_list(input_dict: ConversationsMap) -> ShareGptConversations:
output: ShareGptConversations = []
for conv_id, conv_data in input_dict.items():
new_item = {"id": conv_id, "messages": conv_data}
output.append(new_item)
return output

View File

@ -0,0 +1,28 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import logging
from enum import Enum
class Color(Enum):
RED = "\033[91m"
GREEN = "\033[92m"
BLUE = "\033[94m"
PURPLE = "\033[95m"
CYAN = "\033[96m"
YELLOW = "\033[93m"
RESET = "\033[0m"
def __str__(self):
return self.value
TEXT_SEPARATOR = "-" * 100
# Configure the logger
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s [%(levelname)s] - %(message)s",
datefmt="%d-%m-%Y %H:%M:%S",
)
logger = logging.getLogger(__name__)

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,354 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Download dataset from:
https://huggingface.co/datasets/philschmid/sharegpt-raw/blob/main/sharegpt_20230401_clean_lang_split.json
Convert to OpenAI API:
export INPUT_FILE=sharegpt_20230401_clean_lang_split.json
python convert_sharegpt_to_openai.py $INPUT_FILE sharegpt_conv_128.json --max-items=128
"""
import argparse
import json
import random
from statistics import mean
from typing import Any, Optional
import pandas as pd # type: ignore
import tqdm # type: ignore
from transformers import AutoTokenizer # type: ignore
def has_non_english_chars(text: str) -> bool:
return not text.isascii()
def content_is_valid(
content: str, min_content_len: Optional[int], max_content_len: Optional[int]
) -> bool:
if min_content_len and len(content) < min_content_len:
return False
if max_content_len and len(content) > max_content_len:
return False
return has_non_english_chars(content)
def print_stats(
conversations: "list[dict[Any, Any]]", tokenizer: Optional[AutoTokenizer] = None
) -> None:
# Collect statistics
stats = []
print("\nCollecting statistics...")
for item in tqdm.tqdm(conversations):
# item has "id" and "messages"
messages = item["messages"]
user_turns = 0
assistant_turns = 0
user_words = 0
assistant_words = 0
conv_chars = 0
user_tokens: list[int] = []
assistant_tokens: list[int] = []
for m in messages:
content = m["content"]
conv_chars += len(content)
content_num_words = content.count(" ") + 1
num_tokens = 0
if tokenizer:
num_tokens = len(tokenizer(m["content"]).input_ids)
if m["role"] == "user":
user_turns += 1
user_words += content_num_words
if tokenizer:
user_tokens.append(num_tokens)
elif m["role"] == "assistant":
assistant_turns += 1
assistant_words += content_num_words
if tokenizer:
assistant_tokens.append(num_tokens)
# assert user_turns == assistant_turns, \
# f"Invalid conversation ID {item['id']}"
conv_words = user_words + assistant_words
item_stats = {
"user_turns": user_turns,
"assistant_turns": assistant_turns,
"user_words": user_words,
"assistant_words": assistant_words,
"conv_turns": len(messages),
"conv_words": conv_words,
"conv_characters": conv_chars,
}
if len(user_tokens) > 0:
item_stats["user_tokens"] = int(mean(user_tokens))
if len(assistant_tokens) > 0:
item_stats["assistant_tokens"] = int(mean(assistant_tokens))
stats.append(item_stats)
print("\nStatistics:")
percentiles = [0.25, 0.5, 0.75, 0.9, 0.99, 0.999, 0.9999]
df = pd.DataFrame(stats)
print(df.describe(percentiles=percentiles).transpose())
def convert_sharegpt_to_openai(
seed: int,
input_file: str,
output_file: str,
max_items: Optional[int],
min_content_len: Optional[int] = None,
max_content_len: Optional[int] = None,
min_turns: Optional[int] = None,
max_turns: Optional[int] = None,
model: Optional[str] = None,
) -> None:
if min_turns and max_turns:
assert min_turns <= max_turns
if min_content_len and max_content_len:
# Verify that min is not larger than max if both were given
assert min_content_len <= max_content_len
print(
f"Input parameters:\n{seed=}, {max_items=}, {min_content_len=},"
f" {max_content_len=}, {min_turns=}, {max_turns=}\n"
)
random.seed(seed)
tokenizer = None
if model is not None:
print(f"Loading tokenizer from: {model}")
tokenizer = AutoTokenizer.from_pretrained(model)
# Read the ShareGPT JSON file
print(f"Reading file: {input_file}")
with open(input_file, encoding="utf-8") as f:
# Should be a list of dicts
# Each dict should have "id" (string) and "conversations" (list of dicts)
sharegpt_data = json.load(f)
assert isinstance(sharegpt_data, list), "Input file should contain a list of dicts"
print(f"Total items in input file: {len(sharegpt_data):,}")
print(f"Shuffling dataset with seed {seed}")
random.shuffle(sharegpt_data)
# Map conversation ID to the all the messages
conversation_parts: dict[str, list[Any]] = {}
for item in tqdm.tqdm(sharegpt_data):
assert "id" in item, "Missing key 'id'"
assert "conversations" in item, "Missing key 'conversations'"
# Conversation ID (e.g: "hiWPlMD") and part/session (0, 1, 2, etc.)
conv_id, _ = item["id"].split("_")
new_turns = item["conversations"]
if conv_id not in conversation_parts:
# Start new conversation
conversation_parts[conv_id] = []
elif len(conversation_parts[conv_id]) > 0 and len(new_turns) > 0:
prev_turns = conversation_parts[conv_id][-1]
if prev_turns[-1]["from"] == new_turns[0]["from"]:
new_turns = new_turns[1:]
if len(new_turns) > 0:
# We assume that parts are in order in the ShareGPT dataset
conversation_parts[conv_id].append(new_turns)
dataset: list[dict[str, Any]] = []
for conv_id, conv_parts in conversation_parts.items():
new_item = {"id": conv_id}
conversations: list[dict[str, str]] = []
# Merge all parts
for conv_part in conv_parts:
conversations.extend(conv_part)
if len(conversations) > 0:
new_item["conversations"] = conversations
dataset.append(new_item)
print(f"Total unique conversations (IDs) in input file: {len(dataset):,}")
# Final output data
final_openai_dataset: list[dict] = []
# Filter conversations from the ShareGPT dataset and convert to OpenAI format
for item in tqdm.tqdm(dataset):
messages: list[dict] = []
assert "id" in item, "Missing key 'id'"
assert "conversations" in item, "Missing key 'conversations'"
conv_id = item["id"]
conversations = item["conversations"]
if min_turns is not None and len(conversations) < min_turns:
# Skip short conversations
continue
# Convert each message in the conversation, up to max_turns if specified
for i, turn in enumerate(conversations):
assert "from" in turn and "value" in turn, (
f"Invalid conversation ID {conv_id} - missing 'from' or 'value'"
)
role = None
turn_from = turn["from"]
if turn_from in {"human", "user"}:
role = "user"
elif turn_from in {"gpt", "bing", "chatgpt", "bard"}:
role = "assistant"
elif turn_from == "system":
role = "system"
assert role is not None, (
f"Invalid conversation ID {conv_id} - 'from'='{turn_from}' is invalid"
)
if i == 0 and role != "user":
# If the first message is from assistant (gpt), skip it.
# this happens when the conversation is a follow-up
# to a previous conversation (from the same user).
continue
if max_turns is not None and i >= max_turns:
break
# Convert message to OpenAI format (with "role" and "content")
content = turn["value"]
messages.append({"role": role, "content": content})
# Add the converted conversation to the OpenAI format
if len(messages) > 0:
valid_messages = True
# First turn should always be from the user
user_turn = True
for m in messages:
# Make sure that turns alternate between user and assistant
if (user_turn and m["role"] != "user") or (
not user_turn and m["role"] != "assistant"
):
valid_messages = False
break
user_turn = not user_turn
content = m["content"]
valid_messages = content_is_valid(
content, min_content_len, max_content_len
)
if not valid_messages:
break
if valid_messages is True:
final_openai_dataset.append({"id": conv_id, "messages": messages})
assert len(final_openai_dataset) > 0, "Final number of conversations is zero"
print_stats(final_openai_dataset)
print_stats_again = False
if max_items is not None and len(final_openai_dataset) > max_items:
print(f"\n\nSampling {max_items} items from the dataset...")
print_stats_again = True
final_openai_dataset = random.sample(final_openai_dataset, max_items)
if print_stats_again:
# Print stats after the dataset changed
print_stats(final_openai_dataset, tokenizer)
# Write the converted data to a new JSON file
final_size = len(final_openai_dataset)
print(f"\nTotal conversations converted (after filtering): {final_size:,}")
print(f"\nWriting file: {output_file}")
with open(output_file, "w", encoding="utf-8") as f:
json.dump(final_openai_dataset, f, ensure_ascii=False, indent=2)
def main() -> None:
parser = argparse.ArgumentParser(
description="Convert ShareGPT dataset to OpenAI API format"
)
parser.add_argument("input_file", help="Path to the input ShareGPT JSON file")
parser.add_argument(
"output_file", help="Path to the output OpenAI format JSON file"
)
parser.add_argument(
"--seed", type=int, default=0, help="Seed for random number generators"
)
parser.add_argument(
"--max-items",
type=int,
default=None,
help="Maximum number of items in the output file",
)
parser.add_argument(
"--min-turns",
type=int,
default=None,
help="Minimum number of turns per conversation",
)
parser.add_argument(
"--max-turns",
type=int,
default=None,
help="Maximum number of turns per conversation",
)
parser.add_argument(
"--min-content-len",
type=int,
default=None,
help="Min number of characters in the messages' content",
)
parser.add_argument(
"--max-content-len",
type=int,
default=None,
help="Max number of characters in the messages' content",
)
parser.add_argument(
"--model",
type=str,
default=None,
help="LLM model, only the tokenizer will be used",
)
args = parser.parse_args()
convert_sharegpt_to_openai(
args.seed,
args.input_file,
args.output_file,
args.max_items,
args.min_content_len,
args.max_content_len,
args.min_turns,
args.max_turns,
args.model,
)
if __name__ == "__main__":
main()

View File

@ -0,0 +1,35 @@
{
"filetype": "generate_conversations",
"num_conversations": 24,
"text_files": ["pg1184.txt"],
"print_stats": false,
"prompt_input": {
"num_turns": {
"distribution": "uniform",
"min": 12,
"max": 18
},
"common_prefix_num_tokens": {
"distribution": "constant",
"value": 500
},
"prefix_num_tokens": {
"distribution": "lognormal",
"mean": 6,
"sigma": 4,
"max": 1500
},
"num_tokens": {
"distribution": "uniform",
"min": 120,
"max": 160
}
},
"prompt_output": {
"num_tokens": {
"distribution": "uniform",
"min": 80,
"max": 120
}
}
}

View File

@ -0,0 +1,5 @@
numpy>=1.24
pandas>=2.0.0
aiohttp>=3.10
transformers>=4.46
xlsxwriter>=3.2.1

View File

@ -19,7 +19,7 @@ else()
FetchContent_Declare(
flashmla
GIT_REPOSITORY https://github.com/vllm-project/FlashMLA.git
GIT_TAG 575f7724b9762f265bbee5889df9c7d630801845
GIT_TAG 0e43e774597682284358ff2c54530757b654b8d1
GIT_PROGRESS TRUE
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
@ -37,9 +37,9 @@ cuda_archs_loose_intersection(FLASH_MLA_ARCHS "9.0a" "${CUDA_ARCHS}")
if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.3 AND FLASH_MLA_ARCHS)
set(FlashMLA_SOURCES
${flashmla_SOURCE_DIR}/csrc/flash_api.cpp
${flashmla_SOURCE_DIR}/csrc/flash_fwd_mla_bf16_sm90.cu
${flashmla_SOURCE_DIR}/csrc/flash_fwd_mla_fp16_sm90.cu
${flashmla_SOURCE_DIR}/csrc/flash_fwd_mla_metadata.cu)
${flashmla_SOURCE_DIR}/csrc/kernels/splitkv_mla.cu
${flashmla_SOURCE_DIR}/csrc/kernels/mla_combine.cu
${flashmla_SOURCE_DIR}/csrc/kernels/get_mla_metadata.cu)
set(FlashMLA_INCLUDES
${flashmla_SOURCE_DIR}/csrc/cutlass/include

View File

@ -38,7 +38,7 @@ else()
FetchContent_Declare(
vllm-flash-attn
GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git
GIT_TAG 1c2624e53c078854e0637ee566c72fe2107e75f4
GIT_TAG 57b4e68b9f9d94750b46de8f8dbd2bfcc86edd4f
GIT_PROGRESS TRUE
# Don't share the vllm-flash-attn build between build types
BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn

View File

@ -467,6 +467,12 @@ function (define_gpu_extension_target GPU_MOD_NAME)
if (GPU_LANGUAGE STREQUAL "HIP")
# Make this target dependent on the hipify preprocessor step.
add_dependencies(${GPU_MOD_NAME} hipify${GPU_MOD_NAME})
# Make sure we include the hipified versions of the headers, and avoid conflicts with the ones in the original source folder
target_include_directories(${GPU_MOD_NAME} PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/csrc
${GPU_INCLUDE_DIRECTORIES})
else()
target_include_directories(${GPU_MOD_NAME} PRIVATE csrc
${GPU_INCLUDE_DIRECTORIES})
endif()
if (GPU_ARCHITECTURES)
@ -482,8 +488,6 @@ function (define_gpu_extension_target GPU_MOD_NAME)
target_compile_definitions(${GPU_MOD_NAME} PRIVATE
"-DTORCH_EXTENSION_NAME=${GPU_MOD_NAME}")
target_include_directories(${GPU_MOD_NAME} PRIVATE csrc
${GPU_INCLUDE_DIRECTORIES})
target_link_libraries(${GPU_MOD_NAME} PRIVATE torch ${GPU_LIBRARIES})

View File

@ -128,6 +128,45 @@ __global__ void act_and_mul_kernel_with_param(
}
}
template <typename T>
__device__ __forceinline__ T swigluoai_and_mul(const T& gate, const T& up,
float alpha, float limit) {
// clamp gate: min=None, max=limit
const float gate_f = (float)gate;
const float clamped_gate = gate_f > limit ? limit : gate_f;
// clamp up: min=-limit, max=limit
const float up_f = (float)up;
const float clamped_up =
up_f > limit ? limit : (up_f < -limit ? -limit : up_f);
// glu = gate * sigmoid(gate * alpha)
const float sigmoid_val = 1.0f / (1.0f + expf(-clamped_gate * alpha));
const float glu = clamped_gate * sigmoid_val;
// (up + 1) * glu
return (T)((clamped_up + 1.0f) * glu);
}
template <typename scalar_t,
scalar_t (*ACT_FN)(const scalar_t&, const scalar_t&, const float,
const float)>
__global__ void swigluoai_and_mul_kernel(
scalar_t* __restrict__ out, // [..., d]
const scalar_t* __restrict__ input, // [..., 2, d]
const int d, const float alpha, const float limit) {
const int64_t token_idx = blockIdx.x;
// TODO: Vectorize loads and stores.
for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
// gate = x[..., ::2] (even indices)
const scalar_t gate = VLLM_LDG(&input[token_idx * 2 * d + 2 * idx]);
// up = x[..., 1::2] (odd indices)
const scalar_t up = VLLM_LDG(&input[token_idx * 2 * d + 2 * idx + 1]);
out[token_idx * d + idx] = ACT_FN(gate, up, alpha, limit);
}
}
} // namespace vllm
#define LAUNCH_ACTIVATION_GATE_KERNEL_WITH_PARAM(KERNEL, PARAM) \
@ -145,11 +184,31 @@ __global__ void act_and_mul_kernel_with_param(
PARAM); \
});
#define LAUNCH_SIGLUOAI_AND_MUL(KERNEL, ALPHA, LIMIT) \
int d = input.size(-1) / 2; \
int64_t num_tokens = input.numel() / input.size(-1); \
dim3 grid(num_tokens); \
dim3 block(std::min(d, 1024)); \
const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \
const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \
VLLM_DISPATCH_FLOATING_TYPES( \
input.scalar_type(), "clamp_swiglu_kernel_with_params", [&] { \
vllm::swigluoai_and_mul_kernel<scalar_t, KERNEL<scalar_t>> \
<<<grid, block, 0, stream>>>(out.data_ptr<scalar_t>(), \
input.data_ptr<scalar_t>(), d, ALPHA, \
LIMIT); \
});
void fatrelu_and_mul(torch::Tensor& out, // [..., d],
torch::Tensor& input, // [..., 2 * d]
double threshold) {
LAUNCH_ACTIVATION_GATE_KERNEL_WITH_PARAM(vllm::fatrelu_kernel, threshold);
}
void swigluoai_and_mul(torch::Tensor& out, // [..., d]
torch::Tensor& input, // [..., 2 * d]
double alpha, double limit) {
LAUNCH_SIGLUOAI_AND_MUL(vllm::swigluoai_and_mul, alpha, limit);
}
namespace vllm {
// Element-wise activation kernel template.

View File

@ -5,6 +5,7 @@
#include "cuda_utils.h"
#include "cuda_compat.h"
#include "dispatch_utils.h"
#include "quantization/vectorization_utils.cuh"
#ifdef USE_ROCM
#include "quantization/fp8/amd/quant_utils.cuh"
@ -261,14 +262,26 @@ __global__ void reshape_and_cache_kernel(
}
}
// Used by vectorization_utils to copy/convert one element
template <typename OutT, typename InT, Fp8KVCacheDataType kv_dt>
struct CopyWithScaleOp {
float scale;
__device__ __forceinline__ void operator()(OutT& dst, const InT src) const {
if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
dst = static_cast<OutT>(src);
} else {
dst = fp8::scaled_convert<OutT, InT, kv_dt>(src, scale);
}
}
};
template <typename scalar_t, typename cache_t, Fp8KVCacheDataType kv_dt>
__global__ void reshape_and_cache_flash_kernel(
const scalar_t* __restrict__ key, // [num_tokens, num_heads, head_size]
const scalar_t* __restrict__ value, // [num_tokens, num_heads, head_size]
cache_t* __restrict__ key_cache, // [num_blocks, block_size, num_heads,
// head_size]
cache_t* __restrict__ value_cache, // [num_blocks, block_size, num_heads,
// head_size]
cache_t* __restrict__ key_cache, // NHD or HND, shape see comments below
cache_t* __restrict__ value_cache, // same above
const int64_t* __restrict__ slot_mapping, // [num_tokens]
const int64_t block_stride, const int64_t page_stride,
const int64_t head_stride, const int64_t key_stride,
@ -282,25 +295,58 @@ __global__ void reshape_and_cache_flash_kernel(
}
const int64_t block_idx = slot_idx / block_size;
const int64_t block_offset = slot_idx % block_size;
const int n = num_heads * head_size;
for (int i = threadIdx.x; i < n; i += blockDim.x) {
const int64_t src_key_idx = token_idx * key_stride + i;
const int64_t src_value_idx = token_idx * value_stride + i;
const int head_idx = i / head_size;
const int head_offset = i % head_size;
const int64_t tgt_key_value_idx = block_idx * block_stride +
block_offset * page_stride +
head_idx * head_stride + head_offset;
scalar_t tgt_key = key[src_key_idx];
scalar_t tgt_value = value[src_value_idx];
if constexpr (kv_dt == Fp8KVCacheDataType::kAuto) {
key_cache[tgt_key_value_idx] = tgt_key;
value_cache[tgt_key_value_idx] = tgt_value;
} else {
key_cache[tgt_key_value_idx] =
fp8::scaled_convert<cache_t, scalar_t, kv_dt>(tgt_key, *k_scale);
value_cache[tgt_key_value_idx] =
fp8::scaled_convert<cache_t, scalar_t, kv_dt>(tgt_value, *v_scale);
const int n_elems = num_heads * head_size;
// pointers to the beginning of the source row for this token.
const scalar_t* __restrict__ key_src = key + token_idx * key_stride;
const scalar_t* __restrict__ value_src = value + token_idx * value_stride;
// find the start position inside the kv-cache for this token.
cache_t* __restrict__ key_dst =
key_cache + block_idx * block_stride + block_offset * page_stride;
cache_t* __restrict__ value_dst =
value_cache + block_idx * block_stride + block_offset * page_stride;
// this is true for the NHD layout where `head_stride == head_size`
const bool is_contiguous_heads = (head_stride == head_size);
float k_scale_val = (kv_dt == Fp8KVCacheDataType::kAuto) ? 0.f : *k_scale;
float v_scale_val = (kv_dt == Fp8KVCacheDataType::kAuto) ? 0.f : *v_scale;
constexpr int VEC_SIZE = (sizeof(scalar_t) == 2) ? 8 : 4;
CopyWithScaleOp<cache_t, scalar_t, kv_dt> k_op{k_scale_val};
CopyWithScaleOp<cache_t, scalar_t, kv_dt> v_op{v_scale_val};
if (is_contiguous_heads) {
// NHD layout
// kv cache: [num_blocks, block_size, num_heads, head_size]
vectorize_with_alignment<VEC_SIZE>(key_src, key_dst, n_elems, threadIdx.x,
blockDim.x, k_op);
vectorize_with_alignment<VEC_SIZE>(value_src, value_dst, n_elems,
threadIdx.x, blockDim.x, v_op);
} else {
// HND layout: heads are strided, but each head_size segment is contiguous
// kv cache: [num_blocks, num_heads, block_size, head_size]
const int lane = threadIdx.x & 31; // 0..31 within warp
const int warp_id = threadIdx.x >> 5; // warp index within block
const int warps_per_block = blockDim.x >> 5;
for (int head = warp_id; head < num_heads; head += warps_per_block) {
const scalar_t* __restrict__ k_src_h = key_src + head * head_size;
const scalar_t* __restrict__ v_src_h = value_src + head * head_size;
cache_t* __restrict__ k_dst_h =
key_dst + static_cast<int64_t>(head) * head_stride;
cache_t* __restrict__ v_dst_h =
value_dst + static_cast<int64_t>(head) * head_stride;
// within each head, let the 32 threads of the warp perform the vector
// copy
vectorize_with_alignment<VEC_SIZE>(k_src_h, k_dst_h, head_size, lane, 32,
k_op);
vectorize_with_alignment<VEC_SIZE>(v_src_h, v_dst_h, head_size, lane, 32,
v_op);
}
}
}

View File

@ -321,6 +321,8 @@ static inline constexpr auto kFE3M2f =
ScalarType::float_(3, 2, true, ScalarType::NAN_NONE);
static inline constexpr auto kFE4M3fn =
ScalarType::float_(4, 3, true, ScalarType::NAN_EXTD_RANGE_MAX_MIN);
static inline constexpr auto kFE8M0fnu =
ScalarType(8, 0, false, 0, true, ScalarType::NAN_EXTD_RANGE_MAX_MIN);
static inline constexpr auto kFE5M2 = ScalarType::float_IEEE754(5, 2);
static inline constexpr auto kFE8M7 = ScalarType::float_IEEE754(8, 7);
static inline constexpr auto kFE5M10 = ScalarType::float_IEEE754(5, 10);

View File

@ -16,12 +16,14 @@ struct KernelVecType<float> {
using cvt_vec_type = vec_op::FP32Vec16;
};
#if !defined(__aarch64__) || defined(ARM_BF16_SUPPORT)
template <>
struct KernelVecType<c10::BFloat16> {
using load_vec_type = vec_op::BF16Vec16;
using azp_adj_load_vec_type = vec_op::INT32Vec16;
using cvt_vec_type = vec_op::FP32Vec16;
};
#endif
template <>
struct KernelVecType<c10::Half> {

View File

@ -60,3 +60,13 @@ struct enable_sm100_only : Kernel {
#endif
}
};
template <typename Kernel>
struct enable_sm120_only : Kernel {
template <typename... Args>
CUTLASS_DEVICE void operator()(Args&&... args) {
#if defined __CUDA_ARCH__ && __CUDA_ARCH__ == 1200
Kernel::operator()(std::forward<Args>(args)...);
#endif
}
};

View File

@ -45,6 +45,9 @@ struct SSMParamsBase {
index_t out_d_stride;
index_t out_z_batch_stride;
index_t out_z_d_stride;
index_t ssm_states_batch_stride;
index_t ssm_states_dim_stride;
index_t ssm_states_dstate_stride;
// Common data pointers.
void *__restrict__ A_ptr;

View File

@ -132,7 +132,9 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
input_t *Bvar = reinterpret_cast<input_t *>(params.B_ptr) + sequence_start_index * params.B_batch_stride + group_id * params.B_group_stride;
weight_t *C = reinterpret_cast<weight_t *>(params.C_ptr) + dim_id * kNRows * params.C_d_stride;
input_t *Cvar = reinterpret_cast<input_t *>(params.C_ptr) + sequence_start_index * params.C_batch_stride + group_id * params.C_group_stride;
input_t *ssm_states = reinterpret_cast<input_t *>(params.ssm_states_ptr) + (cache_index * params.dim + dim_id * kNRows) * params.dstate;
input_t *ssm_states = reinterpret_cast<input_t *>(params.ssm_states_ptr) +
cache_index * params.ssm_states_batch_stride +
dim_id * kNRows * params.ssm_states_dim_stride;
float D_val[kNRows] = {0};
if (params.D_ptr != nullptr) {
@ -248,7 +250,7 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
}
// Initialize running total
scan_t running_prefix = chunk > 0 ? smem_running_prefix[state_idx + r * MAX_DSTATE] : make_float2(1.0, has_initial_state ? float(ssm_states[state_idx]): 0.0);
scan_t running_prefix = chunk > 0 ? smem_running_prefix[state_idx + r * MAX_DSTATE] : make_float2(1.0, has_initial_state ? float(ssm_states[state_idx * params.ssm_states_dstate_stride]): 0.0);
SSMScanPrefixCallbackOp<weight_t> prefix_op(running_prefix);
typename Ktraits::BlockScanT(smem_scan).InclusiveScan(
@ -259,7 +261,7 @@ void selective_scan_fwd_kernel(SSMParamsBase params) {
if (threadIdx.x == 0) {
smem_running_prefix[state_idx] = prefix_op.running_prefix;
if (chunk == n_chunks - 1) {
ssm_states[state_idx] = input_t(prefix_op.running_prefix.y);
ssm_states[state_idx * params.ssm_states_dstate_stride] = input_t(prefix_op.running_prefix.y);
}
}
#pragma unroll
@ -481,6 +483,10 @@ void set_ssm_params_fwd(SSMParamsBase &params,
params.out_batch_stride = out.stride(1);
params.out_d_stride = out.stride(0);
params.ssm_states_batch_stride = ssm_states.stride(0);
params.ssm_states_dim_stride = ssm_states.stride(1);
params.ssm_states_dstate_stride = ssm_states.stride(2);
}
else{
if (!is_variable_B) {
@ -509,6 +515,10 @@ void set_ssm_params_fwd(SSMParamsBase &params,
}
params.out_batch_stride = out.stride(0);
params.out_d_stride = out.stride(1);
params.ssm_states_batch_stride = ssm_states.stride(0);
params.ssm_states_dim_stride = ssm_states.stride(1);
params.ssm_states_dstate_stride = ssm_states.stride(2);
}
}

View File

@ -20,6 +20,7 @@ namespace MARLIN_NAMESPACE_NAME {
TEMPLATE = ("template __global__ void Marlin<"
"{{scalar_t}}, "
"{{w_type_id}}, "
"{{s_type_id}}, "
"{{threads}}, "
"{{thread_m_blocks}}, "
"{{thread_n_blocks}}, "
@ -77,6 +78,7 @@ def generate_new_kernels():
if scalar_type == "vllm::kFE4M3fn" and group_blocks not in [-1, 8]:
continue
# nvfp4 only supports group_size == 16
# mxfp4 only supports group_size == 32
if scalar_type == "vllm::kFE2M1f" and group_blocks not in [1, 2]:
continue
# other quantization methods don't support group_size = 16
@ -89,9 +91,22 @@ def generate_new_kernels():
c_dtype = "half" if dtype == "fp16" else "nv_bfloat16"
if scalar_type == "vllm::kFE2M1f" and group_blocks == 1:
s_type = "vllm::kFE4M3fn"
elif scalar_type == "vllm::kFE2M1f" and group_blocks == 2:
s_type = "vllm::kFE8M0fnu"
if dtype == "fp16":
# we cannot safely dequantize e8m0 to fp16, so skip this
continue
elif dtype == "fp16":
s_type = "vllm::kFloat16"
elif dtype == "bf16":
s_type = "vllm::kBFloat16"
template_str = jinja2.Template(TEMPLATE).render(
scalar_t=c_dtype,
w_type_id=scalar_type + ".id()",
s_type_id=s_type + ".id()",
threads=threads,
thread_m_blocks=max(m_blocks, 1),
thread_n_blocks=n_blocks,

View File

@ -7,23 +7,25 @@
#include "quantization/gptq_marlin/marlin_dtypes.cuh"
#include "core/scalar_type.hpp"
#define MARLIN_KERNEL_PARAMS \
const int4 *__restrict__ A, const int4 *__restrict__ B, \
int4 *__restrict__ C, int4 *__restrict__ C_tmp, \
const int4 *__restrict__ scales_ptr, \
const uint16_t *__restrict__ scale2_ptr, \
const int4 *__restrict__ zp_ptr, const int *__restrict__ g_idx, \
const int32_t *__restrict__ sorted_token_ids_ptr, \
const int32_t *__restrict__ expert_ids_ptr, \
const int32_t *__restrict__ num_tokens_past_padded_ptr, \
const float *__restrict__ topk_weights_ptr, int top_k, \
bool mul_topk_weights, bool is_ep, int num_groups, int prob_m, \
int prob_n, int prob_k, int *locks, bool use_atomic_add, \
#define MARLIN_KERNEL_PARAMS \
const int4 *__restrict__ A, const int4 *__restrict__ B, \
int4 *__restrict__ C, int4 *__restrict__ C_tmp, \
const int4 *__restrict__ b_bias_ptr, \
const int4 *__restrict__ scales_ptr, \
const uint16_t *__restrict__ scale2_ptr, \
const int4 *__restrict__ zp_ptr, const int *__restrict__ g_idx, \
const int32_t *__restrict__ sorted_token_ids_ptr, \
const int32_t *__restrict__ expert_ids_ptr, \
const int32_t *__restrict__ num_tokens_past_padded_ptr, \
const float *__restrict__ topk_weights_ptr, int top_k, \
bool mul_topk_weights, bool is_ep, int num_groups, int prob_m, \
int prob_n, int prob_k, int *locks, bool has_bias, bool use_atomic_add, \
bool use_fp32_reduce, int max_shared_mem
namespace MARLIN_NAMESPACE_NAME {
template <typename scalar_t, // compute dtype, half or nv_float16
const vllm::ScalarTypeId w_type_id, // weight ScalarType id
const vllm::ScalarTypeId s_type_id, // weight scale ScalarType id
const int threads, // number of threads in a threadblock
const int thread_m_blocks, // number of 16x16 blocks in the m
// dimension (batchsize) of the

View File

@ -280,6 +280,7 @@ __device__ inline void wait_negative_and_add(int* lock) {
template <typename scalar_t, // compute dtype, half or nv_float16
const vllm::ScalarTypeId w_type_id, // weight ScalarType id
const vllm::ScalarTypeId s_type_id, // weight scale ScalarType id
const int threads, // number of threads in a threadblock
const int thread_m_blocks, // number of 16x16 blocks in the m
// dimension (batchsize) of the
@ -299,6 +300,7 @@ __global__ void Marlin(
const int4* __restrict__ B, // 4bit quantized weight matrix of shape kxn
int4* __restrict__ C, // fp16 output buffer of shape mxn
int4* __restrict__ C_tmp, // fp32 tmp output buffer (for reduce)
const int4* __restrict__ b_bias_ptr,
const int4* __restrict__ scales_ptr, // fp16 quantization scales of shape
// (k/groupsize)xn
const uint16_t* __restrict__ scale2_ptr, // fp16 global scale (for nvfp4
@ -318,8 +320,9 @@ __global__ void Marlin(
int prob_n, // output dimension n
int prob_k, // reduction dimension k
int* locks, // extra global storage for barrier synchronization
bool use_atomic_add, // whether to use atomic add to reduce
bool use_fp32_reduce, // whether to use fp32 global reduce
bool has_bias,
bool use_atomic_add, // whether to use atomic add to reduce
bool use_fp32_reduce, // whether to use fp32 global reduce
int max_shared_mem) {
// Each threadblock processes one "stripe" of the B matrix with (roughly) the
// same size, which might involve multiple column "slices" (of width 16 *
@ -342,12 +345,23 @@ __global__ void Marlin(
extern __shared__ int4 sh[];
static constexpr auto w_type = vllm::ScalarType::from_id(w_type_id);
static constexpr auto s_type = vllm::ScalarType::from_id(s_type_id);
if constexpr (w_type == vllm::kFE2M1f) {
static_assert(s_type == vllm::kFE4M3fn && group_blocks == 1 ||
s_type == vllm::kFE8M0fnu && group_blocks == 2);
} else if constexpr (std::is_same<scalar_t, nv_bfloat16>::value) {
static_assert(s_type == vllm::kBFloat16);
} else if constexpr (std::is_same<scalar_t, half>::value) {
static_assert(s_type == vllm::kFloat16);
}
constexpr bool has_zp = w_type == vllm::kU4 || w_type == vllm::kU8;
constexpr bool is_int_type = w_type == vllm::kU4 || w_type == vllm::kU8 ||
w_type == vllm::kU4B8 || w_type == vllm::kU8B128;
// see comments of dequant.h for more details
constexpr bool dequant_skip_flop =
!is_int_type ||
w_type == vllm::kFE4M3fn ||
w_type == vllm::kFE2M1f && s_type == vllm::kFE4M3fn ||
has_zp && !is_zp_float && !std::is_same<scalar_t, nv_bfloat16>::value ||
has_zp && !is_zp_float && !(w_type == vllm::kU8);
@ -365,6 +379,7 @@ __global__ void Marlin(
const int zp_expert_stride =
is_zp_float ? prob_n * prob_k / group_size / 8
: prob_n * prob_k / group_size / (pack_factor * 4);
const int b_bias_expert_stride = prob_n / 8;
// parallel: num valid moe blocks
int num_tokens_past_padded = num_tokens_past_padded_ptr[0];
@ -475,7 +490,7 @@ __global__ void Marlin(
for (int i = 0; i < 4; i++) {
int idx = tid4 * 4 + i;
idx = idx < block_num_valid_tokens ? idx : 0;
if constexpr (w_type == vllm::kFE2M1f) {
if constexpr (w_type == vllm::kFE2M1f && s_type == vllm::kFE4M3fn) {
sh_block_topk_weights[idx] = __hmul2(
global_scale, Dtype::num2num2(Dtype::float2num(
topk_weights_ptr[sh_block_sorted_ids[idx]])));
@ -513,7 +528,7 @@ __global__ void Marlin(
expert_id = expert_ids_ptr[block_id];
}
if constexpr (w_type == vllm::kFE2M1f) {
if constexpr (w_type == vllm::kFE2M1f && s_type == vllm::kFE4M3fn) {
uint16_t val = scale2_ptr[expert_id];
global_scale = Dtype::num2num2(*reinterpret_cast<scalar_t*>(&val));
}
@ -526,6 +541,9 @@ __global__ void Marlin(
if constexpr (has_act_order) {
g_idx += (expert_id - old_expert_id) * prob_k;
}
if (has_bias) {
b_bias_ptr += (expert_id - old_expert_id) * b_bias_expert_stride;
}
read_moe_block_data(block_id);
};
@ -721,7 +739,7 @@ __global__ void Marlin(
s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) +
(threadIdx.x % 32) / 4;
s_sh_rd = s_sh_rd * 2 + warp_row % 2;
s_sh_rd = s_sh_rd * 2 + (warp_row / group_blocks) % 2;
} else if constexpr (group_blocks != -1)
s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) +
@ -734,6 +752,18 @@ __global__ void Marlin(
s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) +
(threadIdx.x % 32) % 4;
int bias_sh_rd;
if constexpr (m_block_size_8) {
bias_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) +
(threadIdx.x % 32) / 8;
} else {
bias_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) +
(threadIdx.x % 32) % 4;
}
int bias_sh_wr = threadIdx.x;
int bias_gl_rd = (thread_n_blocks * 16 / 8) * slice_col + threadIdx.x;
// Zero-points have the same read layout as the scales
// (without column-wise case)
constexpr int num_col_threads = 8;
@ -793,7 +823,19 @@ __global__ void Marlin(
constexpr int sh_b_size = stages * b_sh_stage;
int4* sh_b = sh_new;
int4* sh_red = sh_new;
int4* sh_g_idx = sh_b + (sh_red_size > sh_b_size ? sh_red_size : sh_b_size);
constexpr int sh_size_b_red_min =
(sh_red_size < sh_b_size ? sh_red_size : sh_b_size);
constexpr int sh_size_b_red_max =
(sh_red_size > sh_b_size ? sh_red_size : sh_b_size);
constexpr int sh_bias_size = (thread_n_blocks * 16 / 8);
constexpr int sh_b_red_bias_size =
sh_size_b_red_max > (sh_size_b_red_min + sh_bias_size)
? sh_size_b_red_max
: (sh_size_b_red_min + sh_bias_size);
int4* sh_bias = sh_new + sh_size_b_red_min;
int4* sh_g_idx = sh_new + sh_b_red_bias_size;
int4* sh_zp = sh_g_idx + (stages * g_idx_stage);
constexpr int sh_s_size = has_act_order ? (act_s_max_num_groups * s_sh_stride)
: (stages * s_sh_stage);
@ -803,9 +845,9 @@ __global__ void Marlin(
static_assert(thread_m_blocks * 16 * thread_n_blocks * 16 / 8 <=
stages * b_sh_stage);
int4* sh_a = sh_s + sh_s_size;
constexpr int shm_size_used =
moe_block_size + stages * (g_idx_stage + zp_sh_stage) + sh_s_size +
(sh_red_size > sh_b_size ? sh_red_size : sh_b_size);
constexpr int shm_size_used = moe_block_size +
stages * (g_idx_stage + zp_sh_stage) +
sh_s_size + sh_b_red_bias_size;
// all remaining shared memory is used to cache A (input)
// sh_a_max_row is at least ` stages * 16 * thread_m_blocks `
@ -816,7 +858,8 @@ __global__ void Marlin(
FragA frag_a[2][thread_m_blocks];
I4 frag_b_quant[2][b_thread_vecs];
FragC frag_c[thread_m_blocks][4][2];
FragS frag_s[2][4]; // No act-order
FragS frag_s[2][4]; // No act-order
FragS frag_bias[2][4];
FragS act_frag_s[2][4][4]; // For act-order
int frag_qzp[2][num_ints_per_thread]; // Zero-points
FragZP frag_zp; // Zero-points in fp16
@ -1065,10 +1108,15 @@ __global__ void Marlin(
if constexpr (w_type_id != vllm::kFE2M1f.id()) {
reinterpret_cast<int4*>(&frag_s[k % 2])[0] =
sh_s_stage[s_sh_rd + cur_group_id * s_sh_stride];
} else {
} else if constexpr (group_blocks == 1 || thread_k_blocks > 4) {
reinterpret_cast<int2*>(&frag_s[k % 2])[0] =
reinterpret_cast<int2*>(
sh_s_stage)[s_sh_rd + cur_group_id * (2 * s_sh_stride)];
} else {
reinterpret_cast<int2*>(&frag_s[k % 2])[0] =
reinterpret_cast<int2*>(
sh_s_stage)[s_sh_rd + cur_group_id * (2 * s_sh_stride) +
k % 2];
}
}
}
@ -1281,9 +1329,9 @@ __global__ void Marlin(
int s_quant_0 = reinterpret_cast<int*>(frag_s[k2])[0];
int s_quant_1 = reinterpret_cast<int*>(frag_s[k2])[1];
dequant_fp8_scales<scalar_t2>(s_quant_0,
reinterpret_cast<scalar_t2*>(&frag_s[k2]));
dequant_fp8_scales<scalar_t2>(
dequant_fp8_scales<scalar_t2, s_type_id>(
s_quant_0, reinterpret_cast<scalar_t2*>(&frag_s[k2]));
dequant_fp8_scales<scalar_t2, s_type_id>(
s_quant_1, reinterpret_cast<scalar_t2*>(&frag_s[k2]) + 2);
}
@ -1566,7 +1614,7 @@ __global__ void Marlin(
// Write out the reduce final result in the correct layout. We only actually
// reshuffle matrix fragments in this step, the reduction above is performed
// in fragment layout.
auto write_result = [&]() {
auto write_result = [&](bool last) {
int c_gl_stride = prob_n / 8;
constexpr int c_sh_stride = 2 * thread_n_blocks + 1;
int c_gl_wr_delta = c_gl_stride * (threads / (2 * thread_n_blocks));
@ -1592,7 +1640,7 @@ __global__ void Marlin(
// We first reorder in shared memory to guarantee the most efficient final
// global write patterns
auto write = [&](int idx, float c0, float c1, FragS& s) {
auto write = [&](int idx, float c0, float c1, FragS& s, FragS& b_bias) {
scalar_t2 res =
Dtype::nums2num2(Dtype::float2num(c0), Dtype::float2num(c1));
@ -1601,14 +1649,27 @@ __global__ void Marlin(
if constexpr (!has_act_order && group_blocks == -1 &&
w_type.size_bits() == 4 &&
(has_zp && dequant_skip_flop || !has_zp)) {
res = __hmul2(res, s[0]);
scalar_t2 tmp_scale = s[0];
if constexpr (m_block_size_8) {
tmp_scale = Dtype::num2num2(
reinterpret_cast<scalar_t*>(&s[0])[(threadIdx.x % 8) / 4]);
}
res = __hmul2(res, tmp_scale);
}
if constexpr (w_type == vllm::kFE2M1f) {
if constexpr (w_type == vllm::kFE2M1f && s_type == vllm::kFE4M3fn) {
if (!mul_topk_weights) {
res = __hmul2(res, global_scale);
}
}
if (has_bias && last) {
scalar_t2 tmp_bias = b_bias[0];
if constexpr (m_block_size_8) {
tmp_bias = Dtype::num2num2(
reinterpret_cast<scalar_t*>(&b_bias[0])[(threadIdx.x % 8) / 4]);
}
res = __hadd2(res, tmp_bias);
}
if constexpr (m_block_size_8) {
((scalar_t*)sh_red)[idx] = res.x;
@ -1626,19 +1687,25 @@ __global__ void Marlin(
if constexpr (m_block_size_8) {
int wr = c_sh_wr + 16 * j;
write(wr, frag_c[i][j][0][0], frag_c[i][j][0][1],
frag_s[j / 2][2 * (j % 2) + 0]);
frag_s[j / 2][2 * (j % 2) + 0],
frag_bias[j / 2][2 * (j % 2) + 0]);
write(wr + 8, frag_c[i][j][0][2], frag_c[i][j][0][3],
frag_s[j / 2][2 * (j % 2) + 1]);
frag_s[j / 2][2 * (j % 2) + 1],
frag_bias[j / 2][2 * (j % 2) + 1]);
} else {
int wr = c_sh_wr + 8 * j;
write(wr + (4 * c_sh_stride) * 0 + 0, frag_c[i][j][0][0],
frag_c[i][j][0][1], frag_s[j / 2][2 * (j % 2) + 0]);
frag_c[i][j][0][1], frag_s[j / 2][2 * (j % 2) + 0],
frag_bias[j / 2][2 * (j % 2) + 0]);
write(wr + (4 * c_sh_stride) * 8 + 0, frag_c[i][j][0][2],
frag_c[i][j][0][3], frag_s[j / 2][2 * (j % 2) + 0]);
frag_c[i][j][0][3], frag_s[j / 2][2 * (j % 2) + 0],
frag_bias[j / 2][2 * (j % 2) + 0]);
write(wr + (4 * c_sh_stride) * 0 + 4, frag_c[i][j][1][0],
frag_c[i][j][1][1], frag_s[j / 2][2 * (j % 2) + 1]);
frag_c[i][j][1][1], frag_s[j / 2][2 * (j % 2) + 1],
frag_bias[j / 2][2 * (j % 2) + 1]);
write(wr + (4 * c_sh_stride) * 8 + 4, frag_c[i][j][1][2],
frag_c[i][j][1][3], frag_s[j / 2][2 * (j % 2) + 1]);
frag_c[i][j][1][3], frag_s[j / 2][2 * (j % 2) + 1],
frag_bias[j / 2][2 * (j % 2) + 1]);
}
}
c_sh_wr += 16 * (4 * c_sh_stride);
@ -1805,6 +1872,14 @@ __global__ void Marlin(
}
thread_block_reduce();
if (has_bias && last) {
__syncthreads();
cp_async4_pred(&sh_bias[bias_sh_wr], &b_bias_ptr[bias_gl_rd],
threadIdx.x < 16 * thread_n_blocks / 8);
cp_async_fence();
}
if constexpr (!has_act_order && group_blocks == -1 &&
(has_zp && dequant_skip_flop || !has_zp)) {
if (w_type.size_bits() == 8 || (last || use_atomic_add)) {
@ -1867,11 +1942,20 @@ __global__ void Marlin(
}
barrier_release(&locks[locks_off], last);
}
if (has_bias && last) {
cp_async_wait<0>();
__syncthreads();
reinterpret_cast<int4*>(&frag_bias)[0] = sh_bias[bias_sh_rd];
reinterpret_cast<int4*>(&frag_bias)[1] = sh_bias[bias_sh_rd + 4];
__syncthreads();
}
if (use_atomic_add && slice_count > 1 && slice_idx != 0)
wait_negative_and_add(&locks[locks_off]);
if (last || use_atomic_add)
// only the last block in a slice actually writes the result
write_result();
write_result(last);
int old_slice_row = slice_row;
slice_row = 0;
slice_col_par++;
@ -1904,6 +1988,7 @@ __global__ void Marlin(
for (int i = 0; i < b_sh_wr_iters; i++) B_ptr[i] -= b_gl_stride;
}
bias_gl_rd = (thread_n_blocks * 16 / 8) * slice_col + threadIdx.x;
// Update slice k/n for scales loading
if constexpr (has_act_order) {
slice_k_start = tb_k * slice_row;

View File

@ -51,8 +51,9 @@ __global__ void permute_cols_kernel(
} // namespace marlin
torch::Tensor moe_wna16_marlin_gemm(
torch::Tensor& a, std::optional<torch::Tensor> const& c_or_none,
torch::Tensor& b_q_weight, torch::Tensor& b_scales,
torch::Tensor& a, std::optional<torch::Tensor> c_or_none,
torch::Tensor& b_q_weight,
std::optional<torch::Tensor> const& b_bias_or_none, torch::Tensor& b_scales,
std::optional<torch::Tensor> const& b_zeros_or_none,
std::optional<torch::Tensor> const& g_idx_or_none,
std::optional<torch::Tensor> const& perm_or_none, torch::Tensor& workspace,
@ -212,7 +213,7 @@ int get_kernel_cache_size(thread_config_t const& th_config, bool m_block_size_8,
// Get B size
int tb_k = th_config.thread_k;
int tb_n = th_config.thread_n;
int tb_m = thread_m_blocks * (m_block_size_8 ? 8 : 16);
int tb_m = thread_m_blocks * 16;
// shm size for block_sorted_ids/rd_block_sorted_ids/block_topk_weights
// both of them requires tb_m * 4 bytes (tb_m * int32 or tb_m * float32)
@ -220,6 +221,11 @@ int get_kernel_cache_size(thread_config_t const& th_config, bool m_block_size_8,
int sh_a_size = pipe_stages * (tb_m * tb_k) * 2;
int sh_b_size = pipe_stages * (tb_k * tb_n / pack_factor) * 4;
int sh_red_size = tb_m * (tb_n + 8) * 2;
int sh_bias_size = tb_n * 2;
int tmp_size =
(sh_b_size > sh_red_size ? sh_red_size : sh_b_size) + sh_bias_size;
tmp_size = max(max(sh_b_size, sh_red_size), tmp_size);
int sh_s_size =
get_scales_cache_size(th_config, prob_m, prob_n, prob_k, num_bits,
group_size, has_act_order, is_k_full);
@ -234,8 +240,8 @@ int get_kernel_cache_size(thread_config_t const& th_config, bool m_block_size_8,
sh_zp_size = sh_s_size / 2;
}
int total_size = max(sh_b_size, sh_red_size) + sh_a_size + sh_s_size +
sh_zp_size + sh_g_idx_size + sh_block_meta_size;
int total_size = tmp_size + sh_a_size + sh_s_size + sh_zp_size +
sh_g_idx_size + sh_block_meta_size;
return total_size;
}
@ -270,20 +276,25 @@ bool is_valid_config(thread_config_t const& th_config, bool m_block_size_8,
int cache_size = get_kernel_cache_size(
th_config, m_block_size_8, thread_m_blocks, prob_m, prob_n, prob_k,
num_bits, group_size, has_act_order, is_k_full, has_zp, is_zp_float);
return cache_size <= max_shared_mem;
return cache_size + 512 <= max_shared_mem;
}
#define _GET_IF(W_TYPE, THREAD_M_BLOCKS, THREAD_N_BLOCKS, THREAD_K_BLOCKS, \
M_BLOCK_SIZE_8, GROUP_BLOCKS, NUM_THREADS, IS_ZP_FLOAT) \
else if (q_type == W_TYPE && thread_m_blocks == THREAD_M_BLOCKS && \
thread_n_blocks == THREAD_N_BLOCKS && \
thread_k_blocks == THREAD_K_BLOCKS && \
m_block_size_8 == M_BLOCK_SIZE_8 && \
group_blocks == GROUP_BLOCKS && num_threads == NUM_THREADS && \
is_zp_float == IS_ZP_FLOAT) { \
kernel = Marlin<scalar_t, W_TYPE.id(), NUM_THREADS, THREAD_M_BLOCKS, \
THREAD_N_BLOCKS, THREAD_K_BLOCKS, M_BLOCK_SIZE_8, \
pipe_stages, GROUP_BLOCKS, IS_ZP_FLOAT>; \
#define _GET_IF(W_TYPE, THREAD_M_BLOCKS, THREAD_N_BLOCKS, THREAD_K_BLOCKS, \
M_BLOCK_SIZE_8, GROUP_BLOCKS, NUM_THREADS, IS_ZP_FLOAT) \
else if (q_type == W_TYPE && thread_m_blocks == THREAD_M_BLOCKS && \
thread_n_blocks == THREAD_N_BLOCKS && \
thread_k_blocks == THREAD_K_BLOCKS && \
m_block_size_8 == M_BLOCK_SIZE_8 && \
group_blocks == GROUP_BLOCKS && num_threads == NUM_THREADS && \
is_zp_float == IS_ZP_FLOAT) { \
constexpr auto S_TYPE = \
W_TYPE == vllm::kFE2M1f \
? (GROUP_BLOCKS == 1 ? vllm::kFE4M3fn : vllm::kFE8M0fnu) \
: (std::is_same<scalar_t, half>::value ? vllm::kFloat16 \
: vllm::kBFloat16); \
kernel = Marlin<scalar_t, W_TYPE.id(), S_TYPE.id(), NUM_THREADS, \
THREAD_M_BLOCKS, THREAD_N_BLOCKS, THREAD_K_BLOCKS, \
M_BLOCK_SIZE_8, pipe_stages, GROUP_BLOCKS, IS_ZP_FLOAT>; \
}
// COMMON: cases for (group_blocks in [-1, 2, 4, 8] and is_zp_float == false)
@ -335,31 +346,45 @@ bool is_valid_config(thread_config_t const& th_config, bool m_block_size_8,
_GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS, false) \
_GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS, false) \
_GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS, false) \
\
_GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, -1, NUM_THREADS, false) \
_GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, 8, NUM_THREADS, false)
#define FP4_GET_IF_M1(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \
_GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, 1, NUM_THREADS, false) \
_GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, 1, NUM_THREADS, false)
#define FP4_GET_IF_M234(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \
_GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, 1, NUM_THREADS, false) \
_GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, 1, NUM_THREADS, false) \
_GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, 1, NUM_THREADS, false)
#define FP4_GET_IF(W_TYPE) \
FP4_GET_IF_M1(W_TYPE, 8, 8, 256) \
FP4_GET_IF_M1(W_TYPE, 8, 4, 128) \
FP4_GET_IF_M234(W_TYPE, 16, 4, 256) \
FP4_GET_IF_M234(W_TYPE, 8, 4, 128)
#define BIGGROUP_GET_IF(W_TYPE) \
BIGGROUP_GET_IF_M1(W_TYPE, 8, 8, 256) \
BIGGROUP_GET_IF_M1(W_TYPE, 8, 4, 128) \
BIGGROUP_GET_IF_M234(W_TYPE, 16, 4, 256) \
BIGGROUP_GET_IF_M234(W_TYPE, 8, 4, 128)
#define NVFP4_GET_IF_M1(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \
_GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, 1, NUM_THREADS, false) \
_GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, 1, NUM_THREADS, false)
#define NVFP4_GET_IF_M234(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \
_GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, 1, NUM_THREADS, false) \
_GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, 1, NUM_THREADS, false) \
_GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, 1, NUM_THREADS, false)
#define NVFP4_GET_IF(W_TYPE) \
NVFP4_GET_IF_M1(W_TYPE, 8, 8, 256) \
NVFP4_GET_IF_M1(W_TYPE, 8, 4, 128) \
NVFP4_GET_IF_M234(W_TYPE, 16, 4, 256) \
NVFP4_GET_IF_M234(W_TYPE, 8, 4, 128)
#define MXFP4_GET_IF_M1(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \
_GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, 2, NUM_THREADS, false) \
_GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS, false)
#define MXFP4_GET_IF_M234(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \
_GET_IF(W_TYPE, 2, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS, false) \
_GET_IF(W_TYPE, 3, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS, false) \
_GET_IF(W_TYPE, 4, N_BLOCKS, K_BLOCKS, false, 2, NUM_THREADS, false)
#define MXFP4_GET_IF(W_TYPE) \
MXFP4_GET_IF_M1(W_TYPE, 8, 8, 256) \
MXFP4_GET_IF_M1(W_TYPE, 8, 4, 128) \
MXFP4_GET_IF_M234(W_TYPE, 16, 4, 256) \
MXFP4_GET_IF_M234(W_TYPE, 8, 4, 128)
// We currently have 4-bit models only with group_blocks == 4
#define FZP_GET_IF_M1(W_TYPE, N_BLOCKS, K_BLOCKS, NUM_THREADS) \
_GET_IF(W_TYPE, 1, N_BLOCKS, K_BLOCKS, true, 4, NUM_THREADS, true) \
@ -408,12 +433,17 @@ MarlinFuncPtr get_marlin_kernel(const vllm::ScalarType q_type,
COMMON_GET_IF(vllm::kU4B8)
COMMON_GET_IF(vllm::kU8B128)
BIGGROUP_GET_IF(vllm::kFE4M3fn)
NVFP4_GET_IF(vllm::kFE2M1f)
FP4_GET_IF(vllm::kFE2M1f)
BIGGROUP_GET_IF(vllm::kFE4M3fn)
ACT_GET_IF(vllm::kU4B8)
ACT_GET_IF(vllm::kU8B128)
if (std::is_same<scalar_t, nv_bfloat16>::value) {
if (false) {
}
MXFP4_GET_IF(vllm::kFE2M1f)
}
return kernel;
}
@ -482,16 +512,16 @@ exec_config_t determine_exec_config(const vllm::ScalarType& q_type, int prob_m,
}
template <typename scalar_t>
void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* s,
void* s2, void* zp, void* g_idx, void* perm, void* a_tmp,
void* sorted_token_ids, void* expert_ids,
void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* b_bias,
void* s, void* s2, void* zp, void* g_idx, void* perm,
void* a_tmp, void* sorted_token_ids, void* expert_ids,
void* num_tokens_past_padded, void* topk_weights,
int moe_block_size, int top_k, bool mul_topk_weights, bool is_ep,
int prob_m, int prob_n, int prob_k, void* workspace,
vllm::ScalarType const& q_type, bool has_act_order,
bool is_k_full, bool has_zp, int num_groups, int group_size,
int dev, cudaStream_t stream, int thread_k, int thread_n,
int sms, bool use_atomic_add, bool use_fp32_reduce,
vllm::ScalarType const& q_type, bool has_bias,
bool has_act_order, bool is_k_full, bool has_zp, int num_groups,
int group_size, int dev, cudaStream_t stream, int thread_k,
int thread_n, int sms, bool use_atomic_add, bool use_fp32_reduce,
bool is_zp_float) {
int thread_m_blocks = div_ceil(moe_block_size, 16);
bool m_block_size_8 = moe_block_size == 8;
@ -538,6 +568,7 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* s,
const int4* B_ptr = (const int4*)B;
int4* C_ptr = (int4*)C;
int4* C_tmp_ptr = (int4*)C_tmp;
const int4* bias_ptr = (const int4*)b_bias;
const int4* s_ptr = (const int4*)s;
const uint16_t* s2_ptr = (const uint16_t*)s2;
const int4* zp_ptr = (const int4*)zp;
@ -648,10 +679,10 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* s,
// avoid ">>>" being formatted to "> > >"
// clang-format off
kernel<<<blocks, num_threads, max_shared_mem, stream>>>(
A_ptr, B_ptr, C_ptr, C_tmp_ptr, s_ptr, s2_ptr, zp_ptr, g_idx_ptr,
A_ptr, B_ptr, C_ptr, C_tmp_ptr, bias_ptr, s_ptr, s2_ptr, zp_ptr, g_idx_ptr,
sorted_token_ids_ptr, expert_ids_ptr, num_tokens_past_padded_ptr,
topk_weights_ptr, top_k, mul_topk_weights, is_ep, num_groups, prob_m,
prob_n, prob_k, locks, use_atomic_add, use_fp32_reduce, max_shared_mem);
prob_n, prob_k, locks, has_bias, use_atomic_add, use_fp32_reduce, max_shared_mem);
// clang-format on
}
@ -659,7 +690,8 @@ void marlin_mm(const void* A, const void* B, void* C, void* C_tmp, void* s,
torch::Tensor moe_wna16_marlin_gemm(
torch::Tensor& a, std::optional<torch::Tensor> const& c_or_none,
torch::Tensor& b_q_weight, torch::Tensor& b_scales,
torch::Tensor& b_q_weight,
std::optional<torch::Tensor> const& b_bias_or_none, torch::Tensor& b_scales,
std::optional<torch::Tensor> const& global_scale_or_none,
std::optional<torch::Tensor> const& b_zeros_or_none,
std::optional<torch::Tensor> const& g_idx_or_none,
@ -766,7 +798,6 @@ torch::Tensor moe_wna16_marlin_gemm(
num_groups = b_scales.size(1);
torch::Tensor g_idx, perm, a_tmp;
;
if (g_idx_or_none.has_value() && perm_or_none.has_value()) {
g_idx = g_idx_or_none.value();
perm = perm_or_none.value();
@ -815,12 +846,24 @@ torch::Tensor moe_wna16_marlin_gemm(
torch::Tensor global_scale;
if (global_scale_or_none.has_value()) {
global_scale = global_scale_or_none.value();
TORCH_CHECK(b_q_type == vllm::kFE2M1f,
"global_scale can only be used for float4_e2m1f.");
TORCH_CHECK(b_q_type == vllm::kFE2M1f && group_size == 16,
"global_scale can only be used for nvfp4 format.");
} else {
global_scale = torch::empty({0}, options);
TORCH_CHECK(!(b_q_type == vllm::kFE2M1f),
"the global_scale parameter must be passed for float4_e2m1f.");
TORCH_CHECK(!(b_q_type == vllm::kFE2M1f && group_size == 16),
"the global_scale parameter must be passed for nvfp4 format.");
}
bool has_bias = b_bias_or_none.has_value();
torch::Tensor b_bias;
if (has_bias) {
b_bias = b_bias_or_none.value();
TORCH_CHECK(b_bias.device().is_cuda(), "b_bias is not on GPU");
TORCH_CHECK(b_bias.is_contiguous(), "b_bias is not contiguous");
TORCH_CHECK(b_bias.size(1) == size_n, "b_bias.size(0) != size_n");
TORCH_CHECK(b_bias.stride(1) == 1, "b_bias.stride(1) != 1");
} else {
b_bias = torch::empty({0}, options);
}
torch::Tensor b_zeros;
@ -832,7 +875,6 @@ torch::Tensor moe_wna16_marlin_gemm(
b_zeros = torch::empty({0}, options);
}
bool has_zp = b_zeros.size(-1) > 0;
if (has_zp) {
TORCH_CHECK(
b_q_type == vllm::kU4 || b_q_type == vllm::kU8,
@ -890,41 +932,58 @@ torch::Tensor moe_wna16_marlin_gemm(
if (a.scalar_type() == at::ScalarType::Half) {
void* scales_ptr;
if (b_q_type == vllm::kFE2M1f) {
scales_ptr = b_scales.data_ptr<at::Float8_e4m3fn>();
if (group_size == 16)
scales_ptr = b_scales.data_ptr<at::Float8_e4m3fn>();
else if (group_size == 32)
scales_ptr = b_scales.data_ptr<at::Float8_e8m0fnu>();
else
TORCH_CHECK(false,
"float4_e2m1f only supports group_size == 16 (NVFP4) ",
"and group_size == 32 (MXFP4)");
} else {
scales_ptr = b_scales.data_ptr<at::Half>();
}
MARLIN_NAMESPACE_NAME::marlin_mm<half>(
a.data_ptr<at::Half>(), b_q_weight.data_ptr(), c.data_ptr<at::Half>(),
c_tmp.data_ptr<float>(), scales_ptr, global_scale.data_ptr<at::Half>(),
b_zeros.data_ptr(), g_idx.data_ptr(), perm.data_ptr(),
a_tmp.data_ptr<at::Half>(), sorted_token_ids.data_ptr(),
expert_ids.data_ptr(), num_tokens_past_padded.data_ptr(),
topk_weights.data_ptr(), moe_block_size, top_k, mul_topk_weights, is_ep,
size_m, size_n, size_k, workspace.data_ptr(), b_q_type, has_act_order,
is_k_full, has_zp, num_groups, group_size, dev,
c_tmp.data_ptr<float>(), b_bias.data_ptr<at::Half>(), scales_ptr,
global_scale.data_ptr<at::Half>(), b_zeros.data_ptr(), g_idx.data_ptr(),
perm.data_ptr(), a_tmp.data_ptr<at::Half>(),
sorted_token_ids.data_ptr(), expert_ids.data_ptr(),
num_tokens_past_padded.data_ptr(), topk_weights.data_ptr(),
moe_block_size, top_k, mul_topk_weights, is_ep, size_m, size_n, size_k,
workspace.data_ptr(), b_q_type, has_bias, has_act_order, is_k_full,
has_zp, num_groups, group_size, dev,
at::cuda::getCurrentCUDAStream(dev), thread_k, thread_n, sms,
use_atomic_add, use_fp32_reduce, is_zp_float);
} else if (a.scalar_type() == at::ScalarType::BFloat16) {
void* scales_ptr;
if (b_q_type == vllm::kFE2M1f) {
scales_ptr = b_scales.data_ptr<at::Float8_e4m3fn>();
if (group_size == 16)
scales_ptr = b_scales.data_ptr<at::Float8_e4m3fn>();
else if (group_size == 32)
scales_ptr = b_scales.data_ptr<at::Float8_e8m0fnu>();
else
TORCH_CHECK(false,
"float4_e2m1f only supports group_size == 16 (NVFP4) ",
"and group_size == 32 (MXFP4)");
} else {
scales_ptr = b_scales.data_ptr<at::BFloat16>();
}
MARLIN_NAMESPACE_NAME::marlin_mm<nv_bfloat16>(
a.data_ptr<at::BFloat16>(), b_q_weight.data_ptr(),
c.data_ptr<at::BFloat16>(), c_tmp.data_ptr<float>(), scales_ptr,
c.data_ptr<at::BFloat16>(), c_tmp.data_ptr<float>(),
b_bias.data_ptr<at::BFloat16>(), scales_ptr,
global_scale.data_ptr<at::BFloat16>(), b_zeros.data_ptr(),
g_idx.data_ptr(), perm.data_ptr(), a_tmp.data_ptr<at::BFloat16>(),
sorted_token_ids.data_ptr(), expert_ids.data_ptr(),
num_tokens_past_padded.data_ptr(), topk_weights.data_ptr(),
moe_block_size, top_k, mul_topk_weights, is_ep, size_m, size_n, size_k,
workspace.data_ptr(), b_q_type, has_act_order, is_k_full, has_zp,
num_groups, group_size, dev, at::cuda::getCurrentCUDAStream(dev),
thread_k, thread_n, sms, use_atomic_add, use_fp32_reduce, is_zp_float);
workspace.data_ptr(), b_q_type, has_bias, has_act_order, is_k_full,
has_zp, num_groups, group_size, dev,
at::cuda::getCurrentCUDAStream(dev), thread_k, thread_n, sms,
use_atomic_add, use_fp32_reduce, is_zp_float);
} else {
TORCH_CHECK(false,
"moe_wna16_marlin_gemm only supports bfloat16 and float16");

View File

@ -24,9 +24,12 @@
#ifndef USE_ROCM
#include <cub/util_type.cuh>
#include <cub/cub.cuh>
#include <cuda/std/functional>
using AddOp = cuda::std::plus<float>;
#else
#include <hipcub/util_type.hpp>
#include <hipcub/hipcub.hpp>
using AddOp = cub::Sum;
#endif
#define MAX(a, b) ((a) > (b) ? (a) : (b))
@ -62,7 +65,6 @@ __launch_bounds__(TPB) __global__
const int thread_row_offset = blockIdx.x * num_cols;
cub::Sum sum;
float threadData(-FLT_MAX);
// Don't touch finished rows.
@ -92,7 +94,7 @@ __launch_bounds__(TPB) __global__
threadData += exp((static_cast<float>(input[idx]) - float_max));
}
const auto Z = BlockReduce(tmpStorage).Reduce(threadData, sum);
const auto Z = BlockReduce(tmpStorage).Reduce(threadData, AddOp());
if (threadIdx.x == 0)
{
@ -186,7 +188,9 @@ __launch_bounds__(TPB) __global__ void moeTopK(
It fuses the softmax, max and argmax into a single kernel.
Limitations:
1) This implementation is intended for when the number of experts is a small power of 2.
1) This implementation is optimized for when the number of experts is a small power of 2.
Additionally it also supports when number of experts is multiple of 64 which is still
faster than the computing softmax and topK separately (only tested on CUDA yet).
2) This implementation assumes k is small, but will work for any k.
*/
@ -196,8 +200,6 @@ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE_PARAM) __global__
int* source_rows, const int k, const int start_expert, const int end_expert)
{
// We begin by enforcing compile time assertions and setting up compile time constants.
static_assert(VPT == (VPT & -VPT), "VPT must be power of 2");
static_assert(NUM_EXPERTS == (NUM_EXPERTS & -NUM_EXPERTS), "NUM_EXPERTS must be power of 2");
static_assert(BYTES_PER_LDG == (BYTES_PER_LDG & -BYTES_PER_LDG), "BYTES_PER_LDG must be power of 2");
static_assert(BYTES_PER_LDG <= 16, "BYTES_PER_LDG must be leq 16");
@ -405,12 +407,10 @@ struct TopkConstants
};
} // namespace detail
template <int EXPERTS, int WARPS_PER_TB, int WARP_SIZE_PARAM, typename IndType>
template <int EXPERTS, int WARPS_PER_TB, int WARP_SIZE_PARAM, int MAX_BYTES_PER_LDG, typename IndType>
void topkGatingSoftmaxLauncherHelper(const float* input, const bool* finished, float* output, IndType* indices,
int* source_row, const int num_rows, const int k, const int start_expert, const int end_expert, cudaStream_t stream)
{
static constexpr std::size_t MAX_BYTES_PER_LDG = 16;
static constexpr int BYTES_PER_LDG = MIN(MAX_BYTES_PER_LDG, sizeof(float) * EXPERTS);
using Constants = detail::TopkConstants<EXPERTS, BYTES_PER_LDG, WARP_SIZE_PARAM>;
static constexpr int VPT = Constants::VPT;
@ -423,21 +423,27 @@ void topkGatingSoftmaxLauncherHelper(const float* input, const bool* finished, f
input, finished, output, num_rows, indices, source_row, k, start_expert, end_expert);
}
#define LAUNCH_SOFTMAX(NUM_EXPERTS, WARPS_PER_TB) \
switch (warpSize) { \
case 32: \
topkGatingSoftmaxLauncherHelper<NUM_EXPERTS, WARPS_PER_TB, 32>( \
gating_output, nullptr, topk_weights, topk_indices, \
token_expert_indices, num_tokens, topk, 0, num_experts, stream); \
break; \
case 64: \
topkGatingSoftmaxLauncherHelper<NUM_EXPERTS, WARPS_PER_TB, 64>( \
gating_output, nullptr, topk_weights, topk_indices, \
token_expert_indices, num_tokens, topk, 0, num_experts, stream); \
break; \
default: \
TORCH_CHECK(false, "Unsupported warp size: ", warpSize); \
#ifndef USE_ROCM
#define LAUNCH_SOFTMAX(NUM_EXPERTS, WARPS_PER_TB, MAX_BYTES) \
static_assert(WARP_SIZE == 32, \
"Unsupported warp size. Only 32 is supported for CUDA"); \
topkGatingSoftmaxLauncherHelper<NUM_EXPERTS, WARPS_PER_TB, WARP_SIZE, MAX_BYTES>( \
gating_output, nullptr, topk_weights, topk_indices, \
token_expert_indices, num_tokens, topk, 0, num_experts, stream);
#else
#define LAUNCH_SOFTMAX(NUM_EXPERTS, WARPS_PER_TB, MAX_BYTES) \
if (WARP_SIZE == 64) { \
topkGatingSoftmaxLauncherHelper<NUM_EXPERTS, WARPS_PER_TB, 64, MAX_BYTES>( \
gating_output, nullptr, topk_weights, topk_indices, \
token_expert_indices, num_tokens, topk, 0, num_experts, stream); \
} else if (WARP_SIZE == 32) { \
topkGatingSoftmaxLauncherHelper<NUM_EXPERTS, WARPS_PER_TB, 32, MAX_BYTES>( \
gating_output, nullptr, topk_weights, topk_indices, \
token_expert_indices, num_tokens, topk, 0, num_experts, stream); \
} else { \
assert(false && "Unsupported warp size. Only 32 and 64 are supported for ROCm"); \
}
#endif
template <typename IndType>
void topkGatingSoftmaxKernelLauncher(
@ -451,38 +457,64 @@ void topkGatingSoftmaxKernelLauncher(
const int topk,
cudaStream_t stream) {
static constexpr int WARPS_PER_TB = 4;
auto warpSize = WARP_SIZE;
static constexpr int BYTES_PER_LDG_POWER_OF_2 = 16;
#ifndef USE_ROCM
static constexpr int BYTES_PER_LDG_MULTIPLE_64 = 8;
#endif
switch (num_experts) {
case 1:
LAUNCH_SOFTMAX(1, WARPS_PER_TB);
LAUNCH_SOFTMAX(1, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
break;
case 2:
LAUNCH_SOFTMAX(2, WARPS_PER_TB);
LAUNCH_SOFTMAX(2, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
break;
case 4:
LAUNCH_SOFTMAX(4, WARPS_PER_TB);
LAUNCH_SOFTMAX(4, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
break;
case 8:
LAUNCH_SOFTMAX(8, WARPS_PER_TB);
LAUNCH_SOFTMAX(8, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
break;
case 16:
LAUNCH_SOFTMAX(16, WARPS_PER_TB);
LAUNCH_SOFTMAX(16, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
break;
case 32:
LAUNCH_SOFTMAX(32, WARPS_PER_TB);
LAUNCH_SOFTMAX(32, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
break;
case 64:
LAUNCH_SOFTMAX(64, WARPS_PER_TB);
LAUNCH_SOFTMAX(64, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
break;
case 128:
LAUNCH_SOFTMAX(128, WARPS_PER_TB);
LAUNCH_SOFTMAX(128, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
break;
case 256:
LAUNCH_SOFTMAX(256, WARPS_PER_TB);
LAUNCH_SOFTMAX(256, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
break;
case 512:
LAUNCH_SOFTMAX(512, WARPS_PER_TB, BYTES_PER_LDG_POWER_OF_2);
break;
// (CUDA only) support multiples of 64 when num_experts is not power of 2.
// ROCm uses WARP_SIZE 64 so 8 bytes loading won't fit for some of num_experts,
// alternatively we can test 4 bytes loading and enable it in future.
#ifndef USE_ROCM
case 192:
LAUNCH_SOFTMAX(192, WARPS_PER_TB, BYTES_PER_LDG_MULTIPLE_64);
break;
case 320:
LAUNCH_SOFTMAX(320, WARPS_PER_TB, BYTES_PER_LDG_MULTIPLE_64);
break;
case 384:
LAUNCH_SOFTMAX(384, WARPS_PER_TB, BYTES_PER_LDG_MULTIPLE_64);
break;
case 448:
LAUNCH_SOFTMAX(448, WARPS_PER_TB, BYTES_PER_LDG_MULTIPLE_64);
break;
case 576:
LAUNCH_SOFTMAX(576, WARPS_PER_TB, BYTES_PER_LDG_MULTIPLE_64);
break;
#endif
default: {
TORCH_CHECK(softmax_workspace != nullptr,
"softmax_workspace must be provided for num_experts that are not a power of 2.");
"softmax_workspace must be provided for num_experts that are not a power of 2 or multiple of 64.");
static constexpr int TPB = 256;
moeSoftmax<TPB><<<num_tokens, TPB, 0, stream>>>(
gating_output, nullptr, softmax_workspace, num_experts);

View File

@ -35,7 +35,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
m.def(
"moe_wna16_marlin_gemm(Tensor! a, Tensor? c_or_none,"
"Tensor! b_q_weight, Tensor! b_scales, Tensor? global_scale, Tensor? "
"Tensor! b_q_weight, Tensor? b_bias_or_none,"
"Tensor! b_scales, Tensor? global_scale, Tensor? "
"b_zeros_or_none,"
"Tensor? g_idx_or_none, Tensor? perm_or_none, Tensor! workspace,"
"Tensor sorted_token_ids,"

View File

@ -138,6 +138,8 @@ void gelu_tanh_and_mul(torch::Tensor& out, torch::Tensor& input);
void fatrelu_and_mul(torch::Tensor& out, torch::Tensor& input,
double threshold);
void swigluoai_and_mul(torch::Tensor& out, torch::Tensor& input,
double alpha = 1.702, double limit = 7.0);
void gelu_new(torch::Tensor& out, torch::Tensor& input);
@ -145,22 +147,6 @@ void gelu_fast(torch::Tensor& out, torch::Tensor& input);
void gelu_quick(torch::Tensor& out, torch::Tensor& input);
void advance_step_flashattn(int64_t num_seqs, int64_t num_queries,
int64_t block_size, torch::Tensor& input_tokens,
torch::Tensor& sampled_token_ids,
torch::Tensor& input_positions,
torch::Tensor& seq_lens,
torch::Tensor& slot_mapping,
torch::Tensor& block_tables);
void advance_step_flashinfer(
int64_t num_seqs, int64_t num_queries, int64_t block_size,
torch::Tensor& input_tokens, torch::Tensor& sampled_token_ids,
torch::Tensor& input_positions, torch::Tensor& seq_lens,
torch::Tensor& slot_mapping, torch::Tensor& block_tables,
torch::Tensor& paged_kv_indices, torch::Tensor& paged_kv_indptr,
torch::Tensor& paged_kv_last_page_len, torch::Tensor& block_table_bounds);
void cutlass_mla_decode(torch::Tensor const& out, torch::Tensor const& q_nope,
torch::Tensor const& q_pe,
torch::Tensor const& kv_c_and_k_pe_cache,
@ -170,15 +156,6 @@ void cutlass_mla_decode(torch::Tensor const& out, torch::Tensor const& q_nope,
torch::Tensor get_cuda_view_from_cpu_tensor(torch::Tensor& cpu_tensor);
#ifndef USE_ROCM
torch::Tensor aqlm_gemm(const torch::Tensor& input, const torch::Tensor& codes,
const torch::Tensor& codebooks,
const torch::Tensor& scales,
const std::vector<int64_t>& codebook_partition_sizes,
const std::optional<torch::Tensor>& bias);
torch::Tensor aqlm_dequant(
const torch::Tensor& codes, const torch::Tensor& codebooks,
const std::vector<int64_t>& codebook_partition_sizes);
torch::Tensor awq_gemm(torch::Tensor _in_feats, torch::Tensor _kernel,
torch::Tensor _scaling_factors, torch::Tensor _zeros,

View File

@ -1,336 +0,0 @@
/*
* The goal of this GPU kernel is to advance input tensors on the GPU directly
* PR: https://github.com/vllm-project/vllm/pull/6338
* Current restrictions:
* 1. Specialized for DraftModelRunner
* 2. Supports flash_attn only
*/
#include "advance_step.cuh"
namespace prepare_inputs {
//
template <int const num_threads>
__global__ void advance_step_flashattn_kernel(
int num_seqs, int num_queries, int block_size, long* input_tokens_ptr,
long const* sampled_token_ids_ptr, long* input_positions_ptr,
int* seq_lens_ptr, long* slot_mapping_ptr, int const* block_tables_ptr,
int64_t const block_tables_stride) {
int const n_pad = num_seqs - num_queries;
if (n_pad && blockIdx.x == 0) {
// Handle cuda graph padding
int const offset = num_queries;
for (int i = threadIdx.x; i < n_pad; i += blockDim.x) {
input_tokens_ptr[offset + i] = 0;
input_positions_ptr[offset + i] = 0;
slot_mapping_ptr[offset + i] = -1;
}
}
int num_query_blocks = div_ceil(num_queries, num_threads);
if (blockIdx.x >= num_query_blocks) {
return;
}
int cur_query_id = blockIdx.x * num_threads + threadIdx.x;
if (cur_query_id >= num_queries) {
return;
}
// Update input_tokens
input_tokens_ptr[cur_query_id] = sampled_token_ids_ptr[cur_query_id];
int seq_len = seq_lens_ptr[cur_query_id];
int next_seq_len = seq_len + 1;
int next_input_pos = next_seq_len - 1;
// Update seq_lens
seq_lens_ptr[cur_query_id] = next_seq_len;
// Update input_positions
input_positions_ptr[cur_query_id] = next_input_pos;
int const* seq_block_tables_ptr =
block_tables_ptr + block_tables_stride * cur_query_id;
int block_index = next_input_pos / block_size;
int block_offset = next_input_pos % block_size;
int slot_num = seq_block_tables_ptr[block_index] * block_size + block_offset;
// Update slot_mapping
slot_mapping_ptr[cur_query_id] = slot_num;
}
inline void verify_tensor(std::string const& name, torch::Tensor const& t,
int64_t const size_0, int64_t const size_1,
c10::ScalarType const type) {
bool size_0_cond = true;
if (size_0 != -1) {
size_0_cond = t.size(0) == size_0;
}
bool size_1_cond = true;
if (size_1 != -1) {
size_1_cond = t.size(1) == size_1;
}
bool is_contiguous = t.is_contiguous();
bool same_type = t.dtype() == type;
bool pass = size_0_cond && size_1_cond && is_contiguous && same_type;
if (!pass) {
TORCH_CHECK(false, "tensor: name = ", name, ", shape = ", t.sizes(),
" is_cont = ", t.is_contiguous(), ", type = ", t.dtype(),
" is not as expected: shape = [", size_0, ", ", size_1,
"], type = ", type);
}
}
/// each thread processes a block per query
__global__ void advance_step_flashinfer_kernel(
int num_threads, int num_seqs, int num_queries, int block_size,
long* input_tokens_ptr, long const* sampled_token_ids_ptr,
long* input_positions_ptr, int* seq_lens_ptr, long* slot_mapping_ptr,
int const* block_tables_ptr, int64_t const block_tables_stride,
int* paged_kv_last_page_len_ptr, int* block_table_bound_ptr) {
int const n_pad = num_seqs - num_queries;
if (n_pad && blockIdx.x == 0) {
// Handle cuda graph padding
int const offset = num_queries;
for (int i = threadIdx.x; i < n_pad; i += blockDim.x) {
input_tokens_ptr[offset + i] = 0;
input_positions_ptr[offset + i] = 0;
slot_mapping_ptr[offset + i] = -1;
}
}
int num_query_blocks = div_ceil(num_queries, num_threads);
if (blockIdx.x < num_query_blocks) {
int cur_query_id = blockIdx.x * num_threads + threadIdx.x;
if (cur_query_id < num_queries) {
// Update input_tokens
input_tokens_ptr[cur_query_id] = sampled_token_ids_ptr[cur_query_id];
int seq_len = seq_lens_ptr[cur_query_id];
int next_seq_len = seq_len + 1;
int next_input_pos = next_seq_len - 1;
// Update seq_lens
seq_lens_ptr[cur_query_id] = next_seq_len;
// Update input_positions
input_positions_ptr[cur_query_id] = next_input_pos;
int const* seq_block_tables_ptr =
block_tables_ptr + block_tables_stride * cur_query_id;
int block_index = next_input_pos / block_size;
int block_offset = next_input_pos % block_size;
// Update paged_kv_last_page_len
paged_kv_last_page_len_ptr[cur_query_id] = block_offset + 1;
int slot_num =
seq_block_tables_ptr[block_index] * block_size + block_offset;
// Update slot_mapping
slot_mapping_ptr[cur_query_id] = slot_num;
block_table_bound_ptr[cur_query_id] = div_ceil(next_seq_len, block_size);
}
}
}
__global__ void advance_step_flashinfer_indptr_kernel(
int num_threads, int num_seqs, int num_queries, int* paged_kv_indptr_ptr,
int* block_table_bound_ptr) {
int idx = blockIdx.x * num_threads + threadIdx.x;
// Update paged_kv_indptr
if (idx == 0) {
paged_kv_indptr_ptr[idx] = 0;
}
if (idx < num_queries) {
int sum = 0;
for (int i = 0; i <= idx; ++i) {
sum += block_table_bound_ptr[i];
}
paged_kv_indptr_ptr[idx + 1] = sum;
}
}
__global__ void advance_step_flashinfer_indices_kernel(
int num_seqs, int num_queries, int const* block_tables_ptr,
int64_t const max_num_blocks_per_seq, int* paged_kv_indices_ptr,
int* paged_kv_indptr_ptr, int* block_table_bound_ptr) {
// note: max_num_blocks_per_seq = block_tables.stride(0)
int tid = blockIdx.x * blockDim.x + threadIdx.x;
// when cuda graphs are enabled, paged_kv_indptr tensor
// has to be updated for the padded queries
// tid represents a query# for paged_kv_indptr tensor
if (num_queries < tid && tid <= num_seqs) {
paged_kv_indptr_ptr[tid] = paged_kv_indptr_ptr[num_queries];
}
// each thread processes a block_ptr in block_tables
// block_tables shape: [num_queries, max_num_blocks_per_seq]
// paged_kv_indices is flattened block_tables.
for (int idx = tid; idx < (num_seqs * max_num_blocks_per_seq);
idx += (gridDim.x * blockDim.x)) {
// block_tables-row = paged_kv_indptr[queryNum]
int queryNum = idx / max_num_blocks_per_seq;
int col = idx % max_num_blocks_per_seq;
if (queryNum < num_queries && col < block_table_bound_ptr[queryNum]) {
int indices_arr_idx = paged_kv_indptr_ptr[queryNum] + col;
int block_tables_idx = queryNum * max_num_blocks_per_seq + col;
paged_kv_indices_ptr[indices_arr_idx] =
block_tables_ptr[block_tables_idx];
}
}
}
void advance_step_flashattn(int num_seqs, int num_queries, int block_size,
torch::Tensor& input_tokens, // type: long
torch::Tensor& sampled_token_ids, // type: long
torch::Tensor& input_positions, // type: long
torch::Tensor& seq_lens, // type: int
torch::Tensor& slot_mapping, // type: long
torch::Tensor& block_tables) { // type: int
if (logging) {
printf("advance_step_flashattn:\n");
printf(" num_seqs = %d\n", num_seqs);
printf(" num_queries = %d\n", num_queries);
printf(" block_size = %d\n", block_size);
}
// Verify all tensors
verify_tensor("input_tokens", input_tokens, num_seqs, -1, at::kLong);
verify_tensor("sampled_token_ids", sampled_token_ids, num_queries, 1,
at::kLong);
verify_tensor("input_positions", input_positions, num_seqs, -1, at::kLong);
verify_tensor("seq_lens", seq_lens, num_seqs, -1, at::kInt);
verify_tensor("slot_mapping", slot_mapping, num_seqs, -1, at::kLong);
verify_tensor("block_tables", block_tables, num_seqs, -1, at::kInt);
int dev = sampled_token_ids.get_device();
cudaStream_t stream = at::cuda::getCurrentCUDAStream(dev);
int blocks;
cudaDeviceGetAttribute(&blocks, cudaDevAttrMultiProcessorCount, dev);
advance_step_flashattn_kernel<max_threads>
<<<blocks, max_threads, 0, stream>>>(
num_seqs, num_queries, block_size,
reinterpret_cast<long*>(input_tokens.data_ptr()),
reinterpret_cast<long const*>(sampled_token_ids.data_ptr()),
reinterpret_cast<long*>(input_positions.data_ptr()),
reinterpret_cast<int*>(seq_lens.data_ptr()),
reinterpret_cast<long*>(slot_mapping.data_ptr()),
reinterpret_cast<int const*>(block_tables.data_ptr()),
block_tables.stride(0));
}
void advance_step_flashinfer(
int num_seqs, int num_queries, int block_size,
torch::Tensor& input_tokens, // type: long
torch::Tensor& sampled_token_ids, // type: long
torch::Tensor& input_positions, // type: long
torch::Tensor& seq_lens, // type: int
torch::Tensor& slot_mapping, // type: long
torch::Tensor& block_tables, // type: int
torch::Tensor& paged_kv_indices, // type: int
torch::Tensor& paged_kv_indptr, // type: int
torch::Tensor& paged_kv_last_page_len, // type: int
torch::Tensor& block_table_bound) { // type: int
if (logging) {
printf("advance_step_flashinfer:\n");
printf(" num_seqs = %d\n", num_seqs);
printf(" num_queries = %d\n", num_queries);
printf(" block_size = %d\n", block_size);
printf(" block_tables.stride(0) = %zu\n", block_tables.stride(0));
}
// Verify all tensors
verify_tensor("input_tokens", input_tokens, num_seqs, -1, at::kLong);
// verify_tensor("sampled_token_ids", sampled_token_ids, num_queries, 1,
// at::kLong);
verify_tensor("input_positions", input_positions, num_seqs, -1, at::kLong);
verify_tensor("seq_lens", seq_lens, num_seqs, -1, at::kInt);
verify_tensor("slot_mapping", slot_mapping, num_seqs, -1, at::kLong);
verify_tensor("block_tables", block_tables, num_seqs, -1, at::kInt);
verify_tensor("paged_kv_indices", paged_kv_indices, -1, -1, at::kInt);
verify_tensor("paged_kv_indptr", paged_kv_indptr, num_seqs + 1, -1, at::kInt);
verify_tensor("paged_kv_last_page_len", paged_kv_last_page_len, num_seqs, -1,
at::kInt);
verify_tensor("block_table_bound", block_table_bound, num_seqs, -1, at::kInt);
int dev = sampled_token_ids.get_device();
cudaStream_t stream = at::cuda::getCurrentCUDAStream(dev);
int blocks;
int threads;
cudaDeviceGetAttribute(&blocks, cudaDevAttrMultiProcessorCount, dev);
cudaDeviceGetAttribute(&threads, cudaDevAttrMaxThreadsPerBlock, dev);
TORCH_CHECK((blocks * threads > num_queries),
"multi-step: not enough threads to map to num_queries = ",
num_queries, " block_tables.stride(0) = ", block_tables.stride(0),
" blocks = ", blocks, " max_threads = ", threads);
if (logging) {
printf("launching kernels with %d blocks and %d threads\n", blocks,
threads);
}
advance_step_flashinfer_kernel<<<blocks, threads, 0, stream>>>(
threads, num_seqs, num_queries, block_size,
reinterpret_cast<long*>(input_tokens.data_ptr()),
reinterpret_cast<long const*>(sampled_token_ids.data_ptr()),
reinterpret_cast<long*>(input_positions.data_ptr()),
reinterpret_cast<int*>(seq_lens.data_ptr()),
reinterpret_cast<long*>(slot_mapping.data_ptr()),
reinterpret_cast<int const*>(block_tables.data_ptr()),
block_tables.stride(0),
reinterpret_cast<int*>(paged_kv_last_page_len.data_ptr()),
reinterpret_cast<int*>(block_table_bound.data_ptr()));
advance_step_flashinfer_indptr_kernel<<<blocks, threads, 0, stream>>>(
threads, num_seqs, num_queries,
reinterpret_cast<int*>(paged_kv_indptr.data_ptr()),
reinterpret_cast<int*>(block_table_bound.data_ptr()));
advance_step_flashinfer_indices_kernel<<<blocks, threads, 0, stream>>>(
num_seqs, num_queries,
reinterpret_cast<int const*>(block_tables.data_ptr()),
block_tables.stride(0),
reinterpret_cast<int*>(paged_kv_indices.data_ptr()),
reinterpret_cast<int*>(paged_kv_indptr.data_ptr()),
reinterpret_cast<int*>(block_table_bound.data_ptr()));
}
} // namespace prepare_inputs
void advance_step_flashattn(int64_t num_seqs, int64_t num_queries,
int64_t block_size, torch::Tensor& input_tokens,
torch::Tensor& sampled_token_ids,
torch::Tensor& input_positions,
torch::Tensor& seq_lens,
torch::Tensor& slot_mapping,
torch::Tensor& block_tables) {
prepare_inputs::advance_step_flashattn(
num_seqs, num_queries, block_size, input_tokens, sampled_token_ids,
input_positions, seq_lens, slot_mapping, block_tables);
}
void advance_step_flashinfer(
int64_t num_seqs, int64_t num_queries, int64_t block_size,
torch::Tensor& input_tokens, torch::Tensor& sampled_token_ids,
torch::Tensor& input_positions, torch::Tensor& seq_lens,
torch::Tensor& slot_mapping, torch::Tensor& block_tables,
torch::Tensor& paged_kv_indices, torch::Tensor& paged_kv_indptr,
torch::Tensor& paged_kv_last_page_len, torch::Tensor& block_table_bound) {
prepare_inputs::advance_step_flashinfer(
num_seqs, num_queries, block_size, input_tokens, sampled_token_ids,
input_positions, seq_lens, slot_mapping, block_tables, paged_kv_indices,
paged_kv_indptr, paged_kv_last_page_len, block_table_bound);
}

View File

@ -1,19 +0,0 @@
#pragma once
#include <torch/all.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include <cuda.h>
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#include <iostream>
namespace prepare_inputs {
static constexpr int max_threads = 256;
static constexpr bool logging = false;
constexpr int div_ceil(int a, int b) { return (a + b - 1) / b; }
} // namespace prepare_inputs

View File

@ -1,597 +0,0 @@
/*
* Modified by Neural Magic
* Adapted from https://github.com/Vahe1994/AQLM
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <cuda.h>
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#include <torch/all.h>
#include <c10/cuda/CUDAStream.h>
#include <c10/cuda/CUDAGuard.h>
#include <iostream>
#include <cstdlib>
namespace vllm {
namespace aqlm {
__global__ void Code1x16MatVec(
const int4* __restrict__ A, const int4* __restrict__ B,
int4* __restrict__ C, const int4* __restrict__ codebook, const int prob_m,
const int prob_k,
const int4 codebook_a_sizes, // cumulative sizes of A spanning each
// codebook, at most 3 long.
const int codebook_stride // as int4.
) {
int a_gl_stride = prob_k / 8 / 8;
int a_gl_rd = (blockDim.x / 32) * blockIdx.x + (threadIdx.x / 32);
bool pred = a_gl_rd < prob_m;
if (pred) {
// advance to the correct codebook, this easy because we only multiply one
// column of the codebook.
auto codebook_size = &codebook_a_sizes.x;
while (a_gl_rd >= *codebook_size) {
codebook += codebook_stride;
++codebook_size;
}
}
int b_gl_rd = 0;
int c_gl_wr = a_gl_rd;
a_gl_rd = a_gl_stride * a_gl_rd + threadIdx.x % 32;
int a_gl_end = a_gl_rd + a_gl_stride - threadIdx.x % 32;
__shared__ int4 sh_b[32 * 9];
float res = 0;
int iters = (prob_k / 8 + 8 * 32 - 1) / (8 * 32);
while (iters--) {
// We pad shared memory to avoid bank conflicts during reads
__syncthreads();
for (int i = threadIdx.x; i < 32 * 8; i += blockDim.x) {
if (b_gl_rd + i < prob_k / 8) sh_b[9 * (i / 8) + i % 8] = B[b_gl_rd + i];
}
__syncthreads();
b_gl_rd += 32 * 8;
int b_sh_rd = 9 * (threadIdx.x % 32);
if (pred && a_gl_rd < a_gl_end) {
const uint16_t* enc = reinterpret_cast<const uint16_t*>(&A[a_gl_rd]);
#pragma unroll
for (int i = 0; i < 8; i++) {
uint32_t dec[4];
// We bypass the L1 cache to avoid massive amounts of memory streaming
// that doesn't actually help us; this brings > 2x speedup.
asm volatile("ld.cg.global.v4.u32 {%0, %1, %2, %3}, [%4];"
: "=r"(dec[0]), "=r"(dec[1]), "=r"(dec[2]), "=r"(dec[3])
: "l"((void*)&codebook[enc[i]]));
half2* a = reinterpret_cast<half2*>(&dec);
half2* b = reinterpret_cast<half2*>(&sh_b[b_sh_rd]);
half2 res2 = {};
#pragma unroll
for (int j = 0; j < 4; j++) res2 = __hfma2(a[j], b[j], res2);
res += __half2float(res2.x) + __half2float(res2.y);
b_sh_rd++;
}
a_gl_rd += 32;
}
}
if (pred) {
#pragma unroll
for (int i = 16; i > 0; i /= 2) res += __shfl_down_sync(0xffffffff, res, i);
if (threadIdx.x % 32 == 0)
reinterpret_cast<__half*>(C)[c_gl_wr] = __float2half(res);
}
}
__global__ void Code2x8MatVec(
const int4* __restrict__ A, const int4* __restrict__ B,
int4* __restrict__ C, const int4* __restrict__ codebook, int prob_m,
int prob_k,
const int4 codebook_a_sizes, // cumulative sizes of A spanning each
// codebook, at most 3 long.
const int codebook_stride // as int4.
) {
int a_gl_stride = prob_k / 8 / 8;
int a_gl_rd = (blockDim.x / 32) * blockIdx.x + (threadIdx.x / 32);
bool pred = a_gl_rd < prob_m;
if (pred) {
// advance to the correct codebook, this easy because we only multiply one
// column of the codebook.
auto codebook_size = &codebook_a_sizes.x;
while (a_gl_rd >= *codebook_size) {
codebook += codebook_stride;
++codebook_size;
}
}
int b_gl_rd = 0;
int c_gl_wr = a_gl_rd;
a_gl_rd = a_gl_stride * a_gl_rd + threadIdx.x % 32;
int a_gl_end = a_gl_rd + a_gl_stride - threadIdx.x % 32;
int lane = threadIdx.x % 8;
extern __shared__ int4 sh[];
int4* sh_b = sh;
int4* sh_code = sh_b + 32 * 9;
int4* sh_code0 = sh_code;
int4* sh_code1 = sh_code + 256 * 8;
for (int i = threadIdx.x; i < 2 * 256; i += blockDim.x) {
int4 dec = codebook[i];
#pragma unroll
for (int j = 0; j < 8; j++) sh_code[8 * i + (j + lane) % 8] = dec;
}
__syncthreads();
float res = 0;
int iters = (prob_k / 8 + 8 * 32 - 1) / (8 * 32);
while (iters--) {
// We pad shared memory to avoid bank conflicts during reads
__syncthreads();
for (int i = threadIdx.x; i < 32 * 8; i += blockDim.x) {
if (b_gl_rd + i < prob_k / 8) sh_b[9 * (i / 8) + i % 8] = B[b_gl_rd + i];
}
__syncthreads();
b_gl_rd += 32 * 8;
int b_sh_rd = 9 * (threadIdx.x % 32);
if (pred && a_gl_rd < a_gl_end) {
const uint8_t* enc = reinterpret_cast<const uint8_t*>(&A[a_gl_rd]);
#pragma unroll
for (int i = 0; i < 8; i++) {
half2* a0 =
reinterpret_cast<half2*>(&sh_code0[8 * enc[2 * i + 0] + lane]);
half2* a1 =
reinterpret_cast<half2*>(&sh_code1[8 * enc[2 * i + 1] + lane]);
half2* b = reinterpret_cast<half2*>(&sh_b[b_sh_rd]);
half2 res2 = {};
#pragma unroll
for (int j = 0; j < 4; j++)
res2 = __hfma2(__hadd2(a0[j], a1[j]), b[j], res2);
res += __half2float(res2.x) + __half2float(res2.y);
b_sh_rd++;
}
a_gl_rd += 32;
}
}
if (pred) {
#pragma unroll
for (int i = 16; i > 0; i /= 2) res += __shfl_down_sync(0xffffffff, res, i);
if (threadIdx.x % 32 == 0)
reinterpret_cast<__half*>(C)[c_gl_wr] = __float2half(res);
}
}
__global__ void Code1x16Dequant(
const int4* __restrict__ A, int4* __restrict__ C,
const int4* __restrict__ codebook, int prob_m, int prob_k,
const int4 codebook_a_sizes, // cumulative sizes of A spanning each
// codebook, at most 3 long, sums to m.
const int codebook_stride // as int4
) {
int a_gl_stride = prob_k / 8 / 8;
int a_gl_rd = (blockDim.x / 32) * blockIdx.x + (threadIdx.x / 32);
bool pred = a_gl_rd < prob_m;
if (pred) {
// advance to the correct codebook, this easy because we only multiply one
// column of the codebook.
auto codebook_size = &codebook_a_sizes.x;
while (a_gl_rd >= *codebook_size) {
codebook += codebook_stride;
++codebook_size;
}
}
a_gl_rd = a_gl_stride * a_gl_rd + threadIdx.x % 32;
int a_gl_end = a_gl_rd + a_gl_stride - threadIdx.x % 32;
int c_gl_stride = prob_k / 8;
int c_gl_wr = (blockDim.x / 32) * blockIdx.x + (threadIdx.x / 32);
c_gl_wr = c_gl_stride * c_gl_wr + (threadIdx.x % 32) * 8;
int iters = (prob_k / 8 - 1) / (8 * 32) + 1;
while (iters--) {
if (pred && a_gl_rd < a_gl_end) {
const uint16_t* enc = reinterpret_cast<const uint16_t*>(&A[a_gl_rd]);
#pragma unroll
for (int i = 0; i < 8; i++) {
int4 chunk;
auto dec = reinterpret_cast<uint32_t*>(&chunk);
// We bypass the L1 cache to avoid massive amounts of memory streaming
// that doesn't actually help us; this brings > 2x speedup.
asm volatile("ld.cg.global.v4.u32 {%0, %1, %2, %3}, [%4];"
: "=r"(dec[0]), "=r"(dec[1]), "=r"(dec[2]), "=r"(dec[3])
: "l"((void*)&codebook[enc[i]]));
C[a_gl_rd * 8 + i] = chunk;
}
}
a_gl_rd += 32;
}
}
__global__ void Code2x8Dequant(
const int4* __restrict__ A, int4* __restrict__ C,
const int4* __restrict__ codebook, int prob_m, int prob_k,
const int4
codebook_a_sizes, // cumulative sizes of A spanning each codebook, at
// most 3 long, corresponds to cols.
const int codebook_stride // as int4
) {
int a_gl_stride = prob_k / 8 / 8;
int a_gl_rd = (blockDim.x / 32) * blockIdx.x + (threadIdx.x / 32);
bool pred = a_gl_rd < prob_m;
if (pred) {
// advance to the correct codebook, this easy because we only multiply one
// column of the codebook.
auto codebook_size = &codebook_a_sizes.x;
while (a_gl_rd >= *codebook_size) {
codebook += codebook_stride;
++codebook_size;
}
}
a_gl_rd = a_gl_stride * a_gl_rd + threadIdx.x % 32;
int a_gl_end = a_gl_rd + a_gl_stride - threadIdx.x % 32;
int lane = threadIdx.x % 8;
int c_gl_stride = prob_k / 8;
int c_gl_wr = (blockDim.x / 32) * blockIdx.x + (threadIdx.x / 32);
c_gl_wr = c_gl_stride * c_gl_wr + (threadIdx.x % 32) * 8;
extern __shared__ int4 sh[];
int4* sh_code = sh;
int4* sh_code0 = sh_code;
int4* sh_code1 = sh_code + 256 * 8;
for (int i = threadIdx.x; i < 2 * 256; i += blockDim.x) {
int4 dec = codebook[i];
#pragma unroll
for (int j = 0; j < 8; j++) sh_code[8 * i + (j + lane) % 8] = dec;
}
__syncthreads();
int iters = (prob_k / 8 - 1) / (8 * 32) + 1;
while (iters--) {
if (pred && a_gl_rd < a_gl_end) {
const uint8_t* enc = reinterpret_cast<const uint8_t*>(&A[a_gl_rd]);
#pragma unroll
for (int i = 0; i < 8; i++) {
int4 chunk;
half2* a0 =
reinterpret_cast<half2*>(&sh_code0[8 * enc[2 * i + 0] + lane]);
half2* a1 =
reinterpret_cast<half2*>(&sh_code1[8 * enc[2 * i + 1] + lane]);
#pragma unroll
for (int j = 0; j < 4; j++)
reinterpret_cast<half2*>(&chunk)[j] = __hadd2(a0[j], a1[j]);
C[a_gl_rd * 8 + i] = chunk;
}
}
a_gl_rd += 32;
}
}
inline int ceildiv(int a, int b) { return (a + b - 1) / b; }
const int THREAD_M = 16;
void code1x16_matvec_cuda(const void* __restrict__ A,
const void* __restrict__ B, void* __restrict__ C,
const void* __restrict__ codebook, int prob_m,
int prob_k, const int4 codebook_a_sizes,
const int codebook_stride) {
int sms;
cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, 0);
int waves = 0;
int thread_m;
do {
waves++;
thread_m = ceildiv(prob_m, waves * sms);
} while (thread_m > THREAD_M);
int blocks = ceildiv(prob_m, thread_m);
int threads = 32 * thread_m;
cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
Code1x16MatVec<<<blocks, threads, 16 * 32 * 9, stream>>>(
(const int4*)A, (const int4*)B, (int4*)C, (const int4*)codebook, prob_m,
prob_k, codebook_a_sizes, codebook_stride);
}
void code2x8_matvec_cuda(const void* __restrict__ A, const void* __restrict__ B,
void* __restrict__ C,
const void* __restrict__ codebook, int prob_m,
int prob_k, const int4 codebook_a_sizes,
const int codebook_stride) {
int sms;
cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, 0);
int waves = 0;
int thread_m;
do {
waves++;
thread_m = ceildiv(prob_m, waves * sms);
} while (thread_m > THREAD_M);
int blocks = ceildiv(prob_m, thread_m);
int threads = 32 * thread_m;
int shared = 16 * (2 * 256 * 8 + 32 * 9);
cudaFuncSetAttribute(Code2x8MatVec,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared);
cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
Code2x8MatVec<<<blocks, threads, shared, stream>>>(
(const int4*)A, (const int4*)B, (int4*)C, (const int4*)codebook, prob_m,
prob_k, codebook_a_sizes, codebook_stride);
}
void code1x16_dequant_cuda(
const void* __restrict__ A, void* __restrict__ C,
const void* __restrict__ codebook, int prob_m, int prob_k,
const int4 codebook_a_sizes, // cumulative sizes of A spanning each
// codebook, at most 3 long.
const int codebook_stride // as int4.
) {
int sms;
cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, 0);
int waves = 0;
int thread_m;
do {
waves++;
thread_m = ceildiv(prob_m, waves * sms);
} while (thread_m > THREAD_M);
int blocks = ceildiv(prob_m, thread_m);
int threads = 32 * thread_m;
cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
Code1x16Dequant<<<blocks, threads, 0, stream>>>(
(const int4*)A, (int4*)C, (const int4*)codebook, prob_m, prob_k,
codebook_a_sizes, // cumulative sizes of A spanning each codebook, at
// most 3 long.
codebook_stride // as int4.
);
}
// Dequantizes the code and codebook into weights.
void code2x8_dequant_cuda(
const void* __restrict__ A, void* __restrict__ C,
const void* __restrict__ codebook, int prob_m, int prob_k,
const int4
codebook_a_sizes, // cumulative sizes of A spanning each codebook, at
// most 3 long, corresponds to cols.
const int codebook_stride // as int4
) {
int sms;
cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount, 0);
int waves = 0;
int thread_m;
do {
waves++;
thread_m = ceildiv(prob_m, waves * sms);
} while (thread_m > THREAD_M);
int blocks = ceildiv(prob_m, thread_m);
int threads = 32 * thread_m;
int shared = 16 * (2 * 256 * 8 + 32 * 9);
cudaStream_t stream = at::cuda::getCurrentCUDAStream().stream();
cudaFuncSetAttribute(Code2x8Dequant,
cudaFuncAttributeMaxDynamicSharedMemorySize, shared);
Code2x8Dequant<<<blocks, threads, shared, stream>>>(
(const int4*)A, (int4*)C, (const int4*)codebook, prob_m, prob_k,
codebook_a_sizes, codebook_stride);
}
int codebook_stride(const torch::Tensor& codebooks) {
return codebooks.stride(0) * codebooks.element_size() / sizeof(int4);
}
void code1x16_matvec(
const torch::Tensor& A, const torch::Tensor& B, torch::Tensor& C,
const torch::Tensor& codebook,
const int4 codebook_a_sizes // cumulative sizes of A spanning each
// codebook, at most 3 long.
) {
const at::cuda::OptionalCUDAGuard device_guard(device_of(A));
int prob_m = C.size(0);
int prob_k = B.size(0);
code1x16_matvec_cuda(A.data_ptr(), B.data_ptr(), C.data_ptr(),
codebook.data_ptr(), prob_m, prob_k, codebook_a_sizes,
codebook_stride(codebook));
}
torch::Tensor code1x16_matmat(const torch::Tensor& input,
const torch::Tensor& codes,
const torch::Tensor& codebooks,
const torch::Tensor& scales,
const int4 codebook_a_sizes,
const std::optional<torch::Tensor>& bias) {
auto input_sizes = input.sizes();
auto out_features = codes.size(0) * codebooks.size(2);
auto flat_input = input.reshape({-1, input.size(-1)});
auto flat_output = torch::empty(
{flat_input.size(0), out_features},
torch::TensorOptions().dtype(input.dtype()).device(input.device()));
for (int i = 0; i < flat_input.size(0); ++i) {
auto input_vec = flat_input.index({i});
auto output_vec = flat_output.index({i});
code1x16_matvec(codes.squeeze(2), input_vec, output_vec, codebooks,
codebook_a_sizes);
}
flat_output *= scales.flatten().unsqueeze(0);
if (bias.has_value()) {
flat_output += bias->unsqueeze(0);
}
auto output_sizes = input_sizes.vec();
output_sizes.pop_back();
output_sizes.push_back(-1);
auto output = flat_output.reshape(output_sizes);
return output;
}
void code2x8_matvec(const torch::Tensor& A, const torch::Tensor& B,
torch::Tensor& C, const torch::Tensor& codebook,
const int4 codebook_a_sizes) {
const at::cuda::OptionalCUDAGuard device_guard(device_of(A));
int prob_m = C.size(0);
int prob_k = B.size(0);
code2x8_matvec_cuda(A.data_ptr(), B.data_ptr(), C.data_ptr(),
codebook.data_ptr(), prob_m, prob_k, codebook_a_sizes,
2 * codebook_stride(codebook));
}
torch::Tensor code2x8_matmat(const torch::Tensor& input,
const torch::Tensor& codes,
const torch::Tensor& codebooks,
const torch::Tensor& scales,
const int4 codebook_a_sizes,
const std::optional<torch::Tensor>& bias) {
auto input_sizes = input.sizes();
auto out_features = codes.size(0) * codebooks.size(2);
auto flat_input = input.reshape({-1, input.size(-1)});
auto flat_output = torch::empty(
{flat_input.size(0), out_features},
torch::TensorOptions().dtype(input.dtype()).device(input.device()));
for (int i = 0; i < flat_input.size(0); ++i) {
auto input_vec = flat_input.index({i});
auto output_vec = flat_output.index({i});
code2x8_matvec(codes.squeeze(2), input_vec, output_vec, codebooks,
codebook_a_sizes);
}
flat_output *= scales.flatten().unsqueeze(0);
if (bias.has_value()) {
flat_output += bias->unsqueeze(0);
}
auto output_sizes = input_sizes.vec();
output_sizes.pop_back();
output_sizes.push_back(-1);
auto output = flat_output.reshape(output_sizes);
return output;
}
// Accumulate the partition sizes.
int4 accumulate_sizes(const std::vector<int64_t>& codebook_partition_sizes) {
int4 cumulative_sizes;
auto cumulative_size = &cumulative_sizes.x;
size_t i = 0;
int last = 0;
assert(codebook_partition_sizes.size() <= 4);
for (; i < codebook_partition_sizes.size(); ++i, ++cumulative_size) {
*cumulative_size = codebook_partition_sizes[i] + last;
last = *cumulative_size;
}
// fill in the rest with unreachable.
for (; i < 4; ++i, ++cumulative_size) {
*cumulative_size = last * 10;
}
return cumulative_sizes;
}
} // namespace aqlm
} // namespace vllm
torch::Tensor aqlm_gemm(const torch::Tensor& input, const torch::Tensor& codes,
const torch::Tensor& codebooks,
const torch::Tensor& scales,
const std::vector<int64_t>& codebook_partition_sizes,
const std::optional<torch::Tensor>& bias) {
int4 cumulative_sizes =
vllm::aqlm::accumulate_sizes(codebook_partition_sizes);
int const nbooks = codebooks.size(0) / codebook_partition_sizes.size();
int const entries = codebooks.size(1);
if (nbooks == 1 && entries == (1 << 16)) {
return vllm::aqlm::code1x16_matmat(input, codes, codebooks, scales,
cumulative_sizes, bias);
}
if (nbooks == 2 && entries == (1 << 8)) {
return vllm::aqlm::code2x8_matmat(input, codes, codebooks, scales,
cumulative_sizes, bias);
}
TORCH_CHECK(false, "AQLM with ", nbooks, " codebooks and ", entries,
" entries is not currently supported.")
return {};
}
torch::Tensor aqlm_dequant(
const torch::Tensor& codes, const torch::Tensor& codebooks,
const std::vector<int64_t>& codebook_partition_sizes) {
int4 cumulative_sizes =
vllm::aqlm::accumulate_sizes(codebook_partition_sizes);
int const nbooks = codebooks.size(0) / codebook_partition_sizes.size();
int const entries = codebooks.size(1);
const at::cuda::OptionalCUDAGuard device_guard(device_of(codes));
int rows = codes.size(1);
int cols = codes.size(0);
auto in_features = codes.size(1) * 8;
auto out_features = codes.size(0);
assert(out_features == std::accumulate(codebook_partition_sizes.begin(),
codebook_partition_sizes.end(), 0));
auto weights = torch::empty({out_features, in_features},
torch::TensorOptions()
.dtype(codebooks.dtype())
.device(codebooks.device()));
if (nbooks == 1 && entries == (1 << 16)) {
vllm::aqlm::code1x16_dequant_cuda(codes.data_ptr(), weights.data_ptr(),
codebooks.data_ptr(), out_features,
in_features, cumulative_sizes,
vllm::aqlm::codebook_stride(codebooks));
// if you wanted to flip to scaling the weights, (though it's 30%-ish slower
// and not consistent with gemv implementation.) weights *=
// scales.index({"...", 0, 0});
return weights;
}
if (nbooks == 2 && entries == (1 << 8)) {
vllm::aqlm::code2x8_dequant_cuda(codes.data_ptr(), weights.data_ptr(),
codebooks.data_ptr(), out_features,
in_features, cumulative_sizes,
vllm::aqlm::codebook_stride(codebooks));
// if you wanted to flip to scaling the weights, (though it's 30%-ish slower
// and not consistent with gemv implementation) weights *=
// scales.index({"...", 0, 0});
return weights;
}
TORCH_CHECK(false, "AQLM with ", nbooks, " codebooks and ", entries,
" entries is not currently supported.")
return {};
}

View File

@ -1,7 +1,9 @@
#include <ATen/cuda/CUDAContext.h>
#include <torch/all.h>
#include "../per_token_group_quant_8bit.h"
#ifndef USE_ROCM
#include "../per_token_group_quant_8bit.h"
#endif
#include <cmath>
@ -339,6 +341,7 @@ void dynamic_scaled_int8_quant(
});
}
#ifndef USE_ROCM
void per_token_group_quant_int8(const torch::Tensor& input,
torch::Tensor& output_q,
torch::Tensor& output_s, int64_t group_size,
@ -346,3 +349,4 @@ void per_token_group_quant_int8(const torch::Tensor& input,
per_token_group_quant_8bit(input, output_q, output_s, group_size, eps,
int8_min, int8_max);
}
#endif

View File

@ -86,6 +86,7 @@ D = s_a s_b \widehat A \widehat B
```
Epilogue parameters:
- `scale_a` is the scale for activations, can be per-tensor (scalar) or per-token (column-vector).
- `scale_b` is the scale for weights, can be per-tensor (scalar) or per-channel (row-vector).
@ -135,7 +136,7 @@ That is precomputed and stored in `azp_with_adj` as a row-vector.
Epilogue parameters:
- `scale_a` is the scale for activations, can be per-tensor (scalar) or per-token (column-vector).
- Generally this will be per-tensor as the zero-points are per-tensor.
- Generally this will be per-tensor as the zero-points are per-tensor.
- `scale_b` is the scale for weights, can be per-tensor (scalar) or per-channel (row-vector).
- `azp_with_adj` is the precomputed zero-point term ($` z_a J_a \widehat B `$), is per-channel (row-vector).
- `bias` is the bias, is always per-channel (row-vector).
@ -152,7 +153,7 @@ That means the zero-point term $` z_a J_a \widehat B `$ becomes an outer product
Epilogue parameters:
- `scale_a` is the scale for activations, can be per-tensor (scalar) or per-token (column-vector).
- Generally this will be per-token as the zero-points are per-token.
- Generally this will be per-token as the zero-points are per-token.
- `scale_b` is the scale for weights, can be per-tensor (scalar) or per-channel (row-vector).
- `azp_adj` is the precomputed zero-point adjustment term ($` \mathbf 1 \widehat B `$), is per-channel (row-vector).
- `azp` is the zero-point (`z_a`), is per-token (column-vector).

View File

@ -0,0 +1,23 @@
#include "scaled_mm_kernels.hpp"
#include "scaled_mm_blockwise_sm120_fp8_dispatch.cuh"
#include "cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp"
namespace vllm {
void cutlass_scaled_mm_blockwise_sm120_fp8(torch::Tensor& out,
torch::Tensor const& a,
torch::Tensor const& b,
torch::Tensor const& a_scales,
torch::Tensor const& b_scales) {
if (out.dtype() == torch::kBFloat16) {
cutlass_gemm_blockwise_sm120_fp8_dispatch<cutlass::bfloat16_t>(
out, a, b, a_scales, b_scales);
} else {
TORCH_CHECK(out.dtype() == torch::kFloat16);
cutlass_gemm_blockwise_sm120_fp8_dispatch<cutlass::half_t>(
out, a, b, a_scales, b_scales);
}
}
} // namespace vllm

View File

@ -0,0 +1,183 @@
#pragma once
#include "cuda_utils.h"
#include "cutlass/cutlass.h"
#include "cutlass/numeric_types.h"
#include "cute/tensor.hpp"
#include "cutlass/tensor_ref.h"
#include "cutlass/gemm/dispatch_policy.hpp"
#include "cutlass/gemm/collective/collective_builder.hpp"
#include "cutlass/gemm/device/gemm_universal_adapter.h"
#include "cutlass/gemm/kernel/gemm_universal.hpp"
#include "cutlass/gemm/kernel/tile_scheduler_params.h"
#include "cutlass/epilogue/dispatch_policy.hpp"
#include "cutlass/epilogue/collective/collective_builder.hpp"
#include "cutlass_extensions/gemm/dispatch_policy.hpp"
#include "cutlass_extensions/gemm/collective/collective_builder.hpp"
#include "cutlass_gemm_caller.cuh"
namespace vllm {
using namespace cute;
// clang-format off
template <class OutType, int ScaleGranularityM,
int ScaleGranularityN, int ScaleGranularityK,
class MmaTileShape, class ClusterShape,
class EpilogueScheduler, class MainloopScheduler>
struct cutlass_3x_gemm_fp8_blockwise {
using ElementAB = cutlass::float_e4m3_t;
using ElementA = ElementAB;
using LayoutA = cutlass::layout::RowMajor;
using LayoutA_Transpose = typename cutlass::layout::LayoutTranspose<LayoutA>::type;
static constexpr int AlignmentA = 128 / cutlass::sizeof_bits<ElementA>::value;
using ElementB = ElementAB;
// ColumnMajor is used for B to match the CUTLASS convention.
using LayoutB = cutlass::layout::ColumnMajor;
using LayoutB_Transpose = typename cutlass::layout::LayoutTranspose<LayoutB>::type;
static constexpr int AlignmentB = 128 / cutlass::sizeof_bits<ElementB>::value;
using ElementD = OutType;
using LayoutD = cutlass::layout::RowMajor;
using LayoutD_Transpose = typename cutlass::layout::LayoutTranspose<LayoutD>::type;
static constexpr int AlignmentD = 128 / cutlass::sizeof_bits<ElementD>::value;
using ElementC = void; // TODO: support bias
using LayoutC = LayoutD;
using LayoutC_Transpose = LayoutD_Transpose;
static constexpr int AlignmentC = AlignmentD;
using ElementAccumulator = float;
using ElementCompute = float;
using ElementBlockScale = float;
using ScaleConfig = cutlass::detail::Sm120BlockwiseScaleConfig<
ScaleGranularityM, ScaleGranularityN, ScaleGranularityK,
cute::UMMA::Major::MN, cute::UMMA::Major::K>;
// layout_SFA and layout_SFB cannot be swapped since they are deduced.
using LayoutSFA = decltype(ScaleConfig::deduce_layoutSFA());
using LayoutSFB = decltype(ScaleConfig::deduce_layoutSFB());
using ArchTag = cutlass::arch::Sm120;
using OperatorClass = cutlass::arch::OpClassTensorOp;
static constexpr auto RoundStyle = cutlass::FloatRoundStyle::round_to_nearest;
using ElementScalar = float;
using DefaultOperation = cutlass::epilogue::fusion::LinearCombination<ElementD, ElementCompute, ElementC, ElementScalar, RoundStyle>;
using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
ArchTag,
OperatorClass,
MmaTileShape,
ClusterShape,
cutlass::epilogue::collective::EpilogueTileAuto,
ElementAccumulator,
ElementCompute,
ElementC,
LayoutC,
AlignmentC,
ElementD,
LayoutD,
AlignmentD,
EpilogueScheduler,
DefaultOperation
>::CollectiveOp;
using StageCountType = cutlass::gemm::collective::StageCountAuto;
using CollectiveMainloop =
typename cutlass::gemm::collective::CollectiveBuilder<
ArchTag,
OperatorClass,
ElementA,
cute::tuple<LayoutA, LayoutSFA>,
AlignmentA,
ElementB,
cute::tuple<LayoutB, LayoutSFB>,
AlignmentB,
ElementAccumulator,
MmaTileShape,
ClusterShape,
cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
MainloopScheduler
>::CollectiveOp;
using KernelType = enable_sm120_only<cutlass::gemm::kernel::GemmUniversal<
Shape<int, int, int, int>, CollectiveMainloop, CollectiveEpilogue>>;
struct GemmKernel : public KernelType {};
};
template <typename Gemm>
void cutlass_gemm_caller_blockwise(torch::Tensor& out, torch::Tensor const& a,
torch::Tensor const& b,
torch::Tensor const& a_scales,
torch::Tensor const& b_scales) {
using GemmKernel = typename Gemm::GemmKernel;
using StrideA = typename Gemm::GemmKernel::StrideA;
using StrideB = typename Gemm::GemmKernel::StrideB;
using StrideD = typename Gemm::GemmKernel::StrideD;
using StrideC = typename Gemm::GemmKernel::StrideC;
using LayoutSFA = typename Gemm::LayoutSFA;
using LayoutSFB = typename Gemm::LayoutSFB;
using ScaleConfig = typename Gemm::ScaleConfig;
using ElementAB = typename Gemm::ElementAB;
using ElementD = typename Gemm::ElementD;
int32_t m = a.size(0), n = b.size(1), k = a.size(1);
StrideA a_stride;
StrideB b_stride;
StrideC c_stride;
a_stride =
cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(m, k, 1));
b_stride =
cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(n, k, 1));
c_stride =
cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(m, n, 1));
LayoutSFA layout_SFA =
ScaleConfig::tile_atom_to_shape_SFA(make_shape(m, n, k, 1));
LayoutSFB layout_SFB =
ScaleConfig::tile_atom_to_shape_SFB(make_shape(m, n, k, 1));
auto a_ptr = static_cast<ElementAB*>(a.data_ptr());
auto b_ptr = static_cast<ElementAB*>(b.data_ptr());
auto a_scales_ptr = static_cast<float*>(a_scales.data_ptr());
auto b_scales_ptr = static_cast<float*>(b_scales.data_ptr());
auto mainloop_args = [&](){
return typename GemmKernel::MainloopArguments{
a_ptr, a_stride, b_ptr, b_stride,
a_scales_ptr, layout_SFA, b_scales_ptr, layout_SFB
};
}();
auto prob_shape = cute::make_shape(m, n, k, 1);
auto c_ptr = static_cast<ElementD*>(out.data_ptr());
typename GemmKernel::EpilogueArguments epilogue_args{
{}, c_ptr, c_stride, c_ptr, c_stride};
c3x::cutlass_gemm_caller<GemmKernel>(a.device(), prob_shape, mainloop_args,
epilogue_args);
}
template <typename OutType>
void cutlass_gemm_blockwise_sm120_fp8_dispatch(torch::Tensor& out,
torch::Tensor const& a,
torch::Tensor const& b,
torch::Tensor const& a_scales,
torch::Tensor const& b_scales) {
// TODO: better heuristics
cutlass_gemm_caller_blockwise<cutlass_3x_gemm_fp8_blockwise<
OutType, 1, 128, 128, Shape<_128, _128, _128>,
Shape<_1, _1, _1>, cutlass::epilogue::collective::EpilogueScheduleAuto,
cutlass::gemm::collective::KernelScheduleAuto>>(
out, a, b, a_scales, b_scales);
}
} // namespace vllm

View File

@ -47,4 +47,10 @@ void cutlass_scaled_mm_blockwise_sm100_fp8(torch::Tensor& out,
torch::Tensor const& b,
torch::Tensor const& a_scales,
torch::Tensor const& b_scales);
void cutlass_scaled_mm_blockwise_sm120_fp8(torch::Tensor& out,
torch::Tensor const& a,
torch::Tensor const& b,
torch::Tensor const& a_scales,
torch::Tensor const& b_scales);
} // namespace vllm

View File

@ -1,6 +1,5 @@
#include "scaled_mm_kernels.hpp"
#include "scaled_mm_sm90_fp8_dispatch.cuh"
#include "cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp"
namespace vllm {
@ -13,11 +12,11 @@ void cutlass_scaled_mm_sm90_fp8(torch::Tensor& out, torch::Tensor const& a,
if (bias) {
TORCH_CHECK(bias->dtype() == out.dtype(),
"currently bias dtype must match output dtype ", out.dtype());
return cutlass_scaled_mm_sm90_fp8_epilogue<c3x::ScaledEpilogueBias>(
out, a, b, a_scales, b_scales, *bias);
return cutlass_scaled_mm_sm90_fp8_epilogue<true>(out, a, b, a_scales,
b_scales, *bias);
} else {
return cutlass_scaled_mm_sm90_fp8_epilogue<c3x::ScaledEpilogue>(
out, a, b, a_scales, b_scales);
return cutlass_scaled_mm_sm90_fp8_epilogue<false>(out, a, b, a_scales,
b_scales);
}
}

Some files were not shown because too many files have changed in this diff Show More